vision_language_multi_image.py 27.1 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
3
"""
This example shows how to use vLLM for running offline inference with
Cyrus Leung's avatar
Cyrus Leung committed
4
5
multi-image input on vision language models for text generation,
using the chat template defined by the model.
6
"""
7

8
import os
9
from argparse import Namespace
10
from dataclasses import asdict
11
from typing import NamedTuple, Optional
12

13
from huggingface_hub import snapshot_download
14
from PIL.Image import Image
15
from transformers import AutoProcessor, AutoTokenizer
16

17
from vllm import LLM, EngineArgs, SamplingParams
18
from vllm.lora.request import LoRARequest
19
20
21
22
23
24
25
from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser

# Question asked about the images in every demo prompt.
QUESTION = "What is the content of each image?"

# Pool of demo images; `--num-images` selects a prefix of this list, so its
# length is also the maximum number of images accepted on the command line.
IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg",
]


39
class ModelRequestData(NamedTuple):
    """Everything a `load_*` helper returns to run one demo request."""

    # Arguments used to construct the `LLM` engine.
    engine_args: EngineArgs
    # Fully formatted text prompt passed to `LLM.generate`.
    prompt: str
    # Images supplied as the "image" multi-modal data.
    image_data: list[Image]
    # Optional token ids forwarded to `SamplingParams(stop_token_ids=...)`.
    stop_token_ids: Optional[list[int]] = None
    # Optional chat template forwarded to `LLM.chat`.
    chat_template: Optional[str] = None
    # Optional LoRA requests forwarded as `lora_request`.
    lora_requests: Optional[list[LoRARequest]] = None
46
47


48
49
50
51
52
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.


53
def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for `rhymes-ai/Aria`.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the images; one placeholder is emitted per URL.

    Returns:
        Engine arguments, prompt, stop token ids and the fetched images.
    """
    model_name = "rhymes-ai/Aria"
    engine_args = EngineArgs(
        model=model_name,
        tokenizer_mode="slow",
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": len(image_urls)},
    )
    # One placeholder line per image, placed before the question.
    placeholders = "<fim_prefix><|img|><fim_suffix>\n" * len(image_urls)
    prompt = (
        f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n<|im_start|>assistant\n"
    )
    stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
    )
74

75

Jennifer Zhao's avatar
Jennifer Zhao committed
76
77
78
79
80
81
82
83
84
85
def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for `CohereForAI/aya-vision-8b`.

    The prompt is rendered by the model's own processor chat template from a
    single user message holding one image entry per URL followed by the
    question text.
    """
    model_name = "CohereForAI/aya-vision-8b"

    engine_args = EngineArgs(
        model=model_name,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


109
def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for `deepseek-ai/deepseek-vl2-tiny`.

    Each image gets a numbered `image_<i>:<image>` line (1-based) ahead of
    the question in a hand-written prompt.
    """
    model_name = "deepseek-ai/deepseek-vl2-tiny"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholder = "".join(
        f"image_{i}:<image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


132
def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for `google/gemma-3-4b-it`.

    The prompt is rendered by the model's processor chat template from one
    user message containing the image entries followed by the question.
    """
    model_name = "google/gemma-3-4b-it"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


166
def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for `h2oai/h2ovl-mississippi-800m`.

    The user message lists a numbered `Image-<i>: <image>` entry per URL and
    is rendered with the tokenizer's chat template; the tokenizer's EOS id is
    used as the only stop token.
    """
    model_name = "h2oai/h2ovl-mississippi-800m"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    stop_token_ids = [tokenizer.eos_token_id]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
    )


199
def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for `HuggingFaceM4/Idefics3-8B-Llama3`.

    Uses a hand-written prompt with a numbered `Image-<i>: <image>` entry per
    URL ahead of the question.
    """
    model_name = "HuggingFaceM4/Idefics3-8B-Llama3"

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=16,
        enforce_eager=True,
        limit_mm_per_prompt={"image": len(image_urls)},
        # if you are running out of memory, you can reduce the "longest_edge".
        # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
        mm_processor_kwargs={
            "size": {"longest_edge": 2 * 364},
        },
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


227
228
229
230
231
232
233
234
235
236
237
def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for `HuggingFaceTB/SmolVLM2-2.2B-Instruct`.

    Uses a hand-written prompt with a numbered `Image-<i>: <image>` entry per
    URL ahead of the question.
    """
    model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=16,
        enforce_eager=True,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={
            "max_image_size": {"longest_edge": 384},
        },
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    prompt = (
        f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
    )
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


255
def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for `OpenGVLab/InternVL2-2B`.

    The user message lists a numbered `Image-<i>: <image>` entry per URL and
    is rendered with the tokenizer's chat template. Stop token ids are looked
    up from a fixed list of stop-token strings.
    """
    model_name = "OpenGVLab/InternVL2-2B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for InternVL
    # models variants may have different stop tokens
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
    )
289
290


291
292
293
294
295
def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for `meta-llama/Llama-4-Scout-17B-16E-Instruct`.

    The engine is configured with `tensor_parallel_size=8`; the prompt is
    rendered by the model's processor chat template.
    """
    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=131072,
        tensor_parallel_size=8,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


325
326
327
328
329
def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for `moonshotai/Kimi-VL-A3B-Instruct`.

    Both the engine and the processor are loaded with
    `trust_remote_code=True`; the prompt is rendered by the processor's chat
    template.
    """
    model_name = "moonshotai/Kimi-VL-A3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=4,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for Mistral-Small-3.1-24B-Instruct-2503.

    The prompt is written by hand: the question comes first, followed by one
    `[IMG]` token per image, all inside a single `[INST]` turn.
    """
    num_images = len(image_urls)

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": num_images},
    )

    image_tokens = "".join("[IMG]" for _ in image_urls)
    images = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=f"<s>[INST]{question}\n{image_tokens}[/INST]",
        image_data=images,
    )


382
def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for `meta-llama/Llama-3.2-11B-Vision-Instruct`.

    Uses a hand-written prompt that refers to a first and a second image.
    """
    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # NOTE(review): the prompt always contains exactly two `<|image|>`
    # placeholders, while `image_data` holds `len(image_urls)` images —
    # confirm behavior when the caller passes a different number of images.
    img_prompt = "Given the first image <|image|> and the second image<|image|>"
    prompt = f"<|begin_of_text|>{img_prompt}, {question}?"
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


402
def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for `nvidia/NVLM-D-72B`.

    The user message lists a numbered `Image-<i>: <image>` entry per URL and
    is rendered with the tokenizer's chat template.
    """
    model_name = "nvidia/NVLM-D-72B"

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        tensor_parallel_size=4,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


432
433
# Ovis
def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for `AIDC-AI/Ovis2-1B`.

    The user message lists a numbered `Image-<i>: <image>` entry per URL and
    is rendered with the tokenizer's chat template.
    """
    model_name = "AIDC-AI/Ovis2-1B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


462
def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for `mistral-community/pixtral-12b`.

    Uses a hand-written `[INST]` prompt: the question first, then one `[IMG]`
    token per image.
    """
    model_name = "mistral-community/pixtral-12b"

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = "[IMG]" * len(image_urls)
    prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


484
def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for `microsoft/Phi-3.5-vision-instruct`.

    Each image gets a numbered `<|image_<i>|>` placeholder (1-based), one per
    line, ahead of the question.
    """
    # num_crops is an override kwarg to the multimodal image processor;
    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
    # to use 16 for single frame scenarios, and 4 for multi-frame.
    #
    # Generally speaking, a larger value for num_crops results in more
    # tokens per image instance, because it may scale the image more in
    # the image preprocessing. Some references in the model docs and the
    # formula for image tokens after the preprocessing
    # transform can be found below.
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    engine_args = EngineArgs(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"num_crops": 4},
    )
    placeholders = "\n".join(
        f"<|image_{i}|>" for i, _ in enumerate(image_urls, start=1)
    )
    prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


517
518
519
520
521
522
523
524
525
526
def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
    """
    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
    show how to process multi-image inputs.

    The model snapshot is obtained via `snapshot_download`, and the
    `vision-lora` subdirectory of that snapshot is attached as a LoRA request.
    """

    model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    vision_lora_path = os.path.join(model_path, "vision-lora")
    engine_args = EngineArgs(
        model=model_path,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        enable_lora=True,
        max_lora_rank=320,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"dynamic_hd": 4},
    )

    placeholders = "".join(f"<|image_{i}|>" for i, _ in enumerate(image_urls, start=1))
    prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
        lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
    )


550
def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for `Qwen/Qwen-VL-Chat`.

    An explicit ChatML template is used both to render the prompt here and,
    via the returned `chat_template`, by callers that use the chat API.
    """
    model_name = "Qwen/Qwen-VL-Chat"
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        limit_mm_per_prompt={"image": len(image_urls)},
    )
    placeholders = "".join(
        f"Picture {i}: <img></img>\n" for i, _ in enumerate(image_urls, start=1)
    )

    # This model does not have a chat_template attribute on its tokenizer,
    # so we need to explicitly pass it. We use ChatML since it's used in the
    # generation utils of the model:
    # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating
    chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"  # noqa: E501

    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        chat_template=chat_template,
    )

    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
        chat_template=chat_template,
    )


593
def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for `Qwen/Qwen2-VL-7B-Instruct`.

    If the optional `qwen_vl_utils` package is importable, its
    `process_vision_info` produces the image inputs from the chat messages;
    otherwise a warning is printed and the images are loaded with
    `fetch_image`.
    """
    try:
        from qwen_vl_utils import process_vision_info
    except ModuleNotFoundError:
        print(
            "WARNING: `qwen-vl-utils` not installed, input images will not "
            "be automatically resized. You can enable this functionality by "
            "`pip install qwen-vl-utils`."
        )
        process_vision_info = None

    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    # Tested on L40
    engine_args = EngineArgs(
        model=model_name,
        # A larger context is requested when the optional helper is missing.
        max_model_len=32768 if process_vision_info is None else 4096,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        },
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    if process_vision_info is None:
        image_data = [fetch_image(url) for url in image_urls]
    else:
        image_data, _ = process_vision_info(messages)

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )
642
643


644
def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for `Qwen/Qwen2.5-VL-3B-Instruct`.

    If the optional `qwen_vl_utils` package is importable, its
    `process_vision_info` (called with `return_video_kwargs=False`) produces
    the image inputs from the chat messages; otherwise a warning is printed
    and the images are loaded with `fetch_image`.
    """
    try:
        from qwen_vl_utils import process_vision_info
    except ModuleNotFoundError:
        print(
            "WARNING: `qwen-vl-utils` not installed, input images will not "
            "be automatically resized. You can enable this functionality by "
            "`pip install qwen-vl-utils`."
        )
        process_vision_info = None

    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        # A larger context is requested when the optional helper is missing.
        max_model_len=32768 if process_vision_info is None else 4096,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        },
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    if process_vision_info is None:
        image_data = [fetch_image(url) for url in image_urls]
    else:
        image_data, _ = process_vision_info(messages, return_video_kwargs=False)

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )


694
# Maps each `--model-type` choice to the loader that builds its request data.
model_example_map = {
    "aria": load_aria,
    "aya_vision": load_aya_vision,
    "deepseek_vl_v2": load_deepseek_vl2,
    "gemma3": load_gemma3,
    "h2ovl_chat": load_h2ovl,
    "idefics3": load_idefics3,
    "internvl_chat": load_internvl,
    "kimi_vl": load_kimi_vl,
    "llama4": load_llama4,
    "mistral3": load_mistral3,
    "mllama": load_mllama,
    "NVLM_D": load_nvlm_d,
    "ovis": load_ovis,
    "phi3_v": load_phi3v,
    "phi4_mm": load_phi4mm,
    "pixtral_hf": load_pixtral_hf,
    "qwen_vl_chat": load_qwen_vl_chat,
    "qwen2_vl": load_qwen2_vl,
    "qwen2_5_vl": load_qwen2_5_vl,
    "smolvlm": load_smolvlm,
}


718
def run_generate(model, question: str, image_urls: list[str], seed: Optional[int]):
    """Run the demo through `LLM.generate` and print the generated text.

    Args:
        model: Key into `model_example_map` selecting the loader to use.
        question: Question passed to the loader to build the prompt.
        image_urls: URLs of the images passed to the loader.
        seed: Seed merged into the engine arguments when creating the `LLM`.
    """
    req_data = model_example_map[model](question, image_urls)

    # Use the `seed` parameter, not a module-level `args` object: the global
    # only exists when this file runs as a script, and would otherwise raise
    # NameError or silently ignore the caller's seed.
    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**engine_args)

    sampling_params = SamplingParams(
        temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
    )

    outputs = llm.generate(
        {
            "prompt": req_data.prompt,
            "multi_modal_data": {"image": req_data.image_data},
        },
        sampling_params=sampling_params,
        lora_request=req_data.lora_requests,
    )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        print("-" * 50)
742
743


744
def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[int]):
    """Run the demo through `LLM.chat` and print the generated text.

    The loader is used only for its engine arguments, stop tokens, chat
    template and LoRA requests; the chat message itself is built here from
    `question` and `image_urls`.

    Args:
        model: Key into `model_example_map` selecting the loader to use.
        question: Text part of the user message.
        image_urls: URLs sent as `image_url` parts of the user message.
        seed: Seed merged into the engine arguments when creating the `LLM`.
    """
    req_data = model_example_map[model](question, image_urls)

    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    # Limits set by the loader take precedence over the zero defaults.
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {}
    )

    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**engine_args)

    sampling_params = SamplingParams(
        temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
    )
    outputs = llm.chat(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": question,
                    },
                    *(
                        {
                            "type": "image_url",
                            "image_url": {"url": image_url},
                        }
                        for image_url in image_urls
                    ),
                ],
            }
        ],
        sampling_params=sampling_params,
        chat_template=req_data.chat_template,
        lora_request=req_data.lora_requests,
    )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        print("-" * 50)
788
789


790
def parse_args():
    """Parse the command-line options for the demo.

    Returns:
        The parsed arguments: `model_type`, `method`, `seed` and `num_images`.
    """
    parser = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "vision language models that support multi-image input for text "
        "generation"
    )
    parser.add_argument(
        "--model-type",
        "-m",
        type=str,
        default="phi3_v",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )
    parser.add_argument(
        "--method",
        type=str,
        default="generate",
        choices=["generate", "chat"],
        help="The method to run in `vllm.LLM`.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Set the seed when initializing `vllm.LLM`.",
    )
    parser.add_argument(
        "--num-images",
        "-n",
        type=int,
        choices=list(range(1, len(IMAGE_URLS) + 1)),  # the max number of images
        default=2,
        help="Number of images to use for the demo.",
    )
    return parser.parse_args()

827

828
829
830
831
832
def main(args: Namespace):
    """Run the demo with the selected model, method and number of images.

    Args:
        args: Parsed options providing `model_type`, `method`, `seed` and
            `num_images`.

    Raises:
        ValueError: If `args.method` is neither "generate" nor "chat".
    """
    model = args.model_type
    method = args.method
    seed = args.seed

    # Use the first `num_images` entries of the demo image pool.
    image_urls = IMAGE_URLS[: args.num_images]

    if method == "generate":
        run_generate(model, QUESTION, image_urls, seed)
    elif method == "chat":
        run_chat(model, QUESTION, image_urls, seed)
    else:
        raise ValueError(f"Invalid method: {method}")


# Script entry point: parse the command line and run the demo.
if __name__ == "__main__":
    args = parse_args()
    main(args)