vision_language_multi_image.py 28.3 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
"""
This example shows how to use vLLM for running offline inference with
multi-image input on vision language models for text generation,
using the chat template defined by the model.
"""
8

9
import os
10
from argparse import Namespace
11
from dataclasses import asdict
12
from typing import NamedTuple, Optional
13

14
from huggingface_hub import snapshot_download
15
from PIL.Image import Image
16
from transformers import AutoProcessor, AutoTokenizer
17

18
from vllm import LLM, EngineArgs, SamplingParams
19
from vllm.lora.request import LoRARequest
20
21
22
23
24
25
26
from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser

# Question asked about the images in every demo prompt.
QUESTION = "What is the content of each image?"

# Pool of demo images; `main` uses the first `--num-images` entries, and the
# pool size is the upper bound accepted for that option.
IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg",
]


40
class ModelRequestData(NamedTuple):
    """Everything a `load_*` function hands back to the runners.

    The first three fields are required; the remaining ones default to
    ``None`` and are only set by loaders that need them.
    """

    # Arguments used to construct `vllm.LLM`.
    engine_args: EngineArgs
    # Fully formatted prompt used by the `generate` path.
    prompt: str
    # Images passed as multi-modal data alongside `prompt`.
    image_data: list[Image]
    # Forwarded to `SamplingParams(stop_token_ids=...)`.
    stop_token_ids: Optional[list[int]] = None
    # Forwarded to `LLM.chat(chat_template=...)`.
    chat_template: Optional[str] = None
    # Forwarded as `lora_request` to `LLM.generate` / `LLM.chat`.
    lora_requests: Optional[list[LoRARequest]] = None
47
48


49
50
51
52
53
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.


54
def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for rhymes-ai/Aria."""
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model="rhymes-ai/Aria",
        tokenizer_mode="slow",
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": num_images},
    )

    # One placeholder line per image, placed ahead of the question.
    image_tags = "<fim_prefix><|img|><fim_suffix>\n" * num_images
    prompt = (
        f"<|im_start|>user\n{image_tags}{question}<|im_end|>\n<|im_start|>assistant\n"
    )

    images = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=[93532, 93653, 944, 93421, 1019, 93653, 93519],
        image_data=images,
    )
75

76

Jennifer Zhao's avatar
Jennifer Zhao committed
77
78
79
80
81
82
83
84
85
86
def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for CohereForAI/aya-vision-8b.

    The prompt is rendered with the chat template of the model's processor.
    """
    model_name = "CohereForAI/aya-vision-8b"

    engine_args = EngineArgs(
        model=model_name,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # A single user turn: all image entries first, then the question text.
    image_parts = [{"type": "image", "image": url} for url in image_urls]
    user_turn = {
        "role": "user",
        "content": image_parts + [{"type": "text", "text": question}],
    }

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        [user_turn], tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


110
def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for deepseek-ai/deepseek-vl2-tiny."""
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model="deepseek-ai/deepseek-vl2-tiny",
        max_model_len=4096,
        max_num_seqs=2,
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
        limit_mm_per_prompt={"image": num_images},
    )

    # Numbered placeholder lines: "image_1:<image>", "image_2:<image>", ...
    image_tags = "".join(
        f"image_{index}:<image>\n" for index in range(1, num_images + 1)
    )
    prompt = f"<|User|>: {image_tags}{question}\n\n<|Assistant|>:"

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


133
def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for google/gemma-3-4b-it.

    The prompt is rendered with the chat template of the model's processor.
    """
    model_name = "google/gemma-3-4b-it"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # A single user turn: all image entries first, then the question text.
    image_parts = [{"type": "image", "image": url} for url in image_urls]
    user_turn = {
        "role": "user",
        "content": image_parts + [{"type": "text", "text": question}],
    }

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        [user_turn], tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


167
def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for h2oai/h2ovl-mississippi-800m.

    The prompt is rendered with the tokenizer's chat template, and the
    tokenizer's EOS token id is returned as the only stop token.
    """
    model_name = "h2oai/h2ovl-mississippi-800m"
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={"image": num_images},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    # Numbered placeholder entries: "Image-1: <image>", "Image-2: <image>", ...
    image_tags = "\n".join(
        f"Image-{index}: <image>\n" for index in range(1, num_images + 1)
    )
    user_turn = {"role": "user", "content": f"{image_tags}\n{question}"}

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        [user_turn], tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    eos_only = [tokenizer.eos_token_id]

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=eos_only,
        image_data=images,
    )


200
def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for HuggingFaceM4/Idefics3-8B-Llama3."""
    num_images = len(image_urls)

    # If you are running out of memory, you can reduce the "longest_edge".
    # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
    processor_kwargs = {"size": {"longest_edge": 2 * 364}}

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model="HuggingFaceM4/Idefics3-8B-Llama3",
        max_model_len=8192,
        max_num_seqs=16,
        enforce_eager=True,
        limit_mm_per_prompt={"image": num_images},
        mm_processor_kwargs=processor_kwargs,
    )

    # Numbered placeholder entries: "Image-1: <image>", "Image-2: <image>", ...
    image_tags = "\n".join(
        f"Image-{index}: <image>\n" for index in range(1, num_images + 1)
    )
    prompt = f"<|begin_of_text|>User:{image_tags}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


228
229
230
231
232
233
234
235
236
237
238
def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for HuggingFaceTB/SmolVLM2-2.2B-Instruct."""
    num_images = len(image_urls)

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model="HuggingFaceTB/SmolVLM2-2.2B-Instruct",
        max_model_len=8192,
        max_num_seqs=16,
        enforce_eager=True,
        limit_mm_per_prompt={"image": num_images},
        mm_processor_kwargs={"max_image_size": {"longest_edge": 384}},
    )

    # Numbered placeholder entries: "Image-1: <image>", "Image-2: <image>", ...
    image_tags = "\n".join(
        f"Image-{index}: <image>\n" for index in range(1, num_images + 1)
    )
    prompt = (
        f"<|im_start|>User:{image_tags}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


256
def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt, images and stop tokens for OpenGVLab/InternVL2-2B.

    The prompt is rendered with the tokenizer's chat template.
    """
    model_name = "OpenGVLab/InternVL2-2B"
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": num_images},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    # Numbered placeholder entries: "Image-1: <image>", "Image-2: <image>", ...
    image_tags = "\n".join(
        f"Image-{index}: <image>\n" for index in range(1, num_images + 1)
    )
    user_turn = {"role": "user", "content": f"{image_tags}\n{question}"}

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        [user_turn], tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for InternVL.
    # Model variants may have different stop tokens; please refer to the
    # model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_token_ids = [
        tokenizer.convert_tokens_to_ids(token)
        for token in ("<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>")
    ]

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=images,
    )
290
291


292
293
294
295
296
def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for meta-llama/Llama-4-Scout-17B-16E-Instruct.

    The prompt is rendered with the chat template of the model's processor.
    """
    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=131072,
        tensor_parallel_size=8,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # A single user turn: all image entries first, then the question text.
    image_parts = [{"type": "image", "image": url} for url in image_urls]
    user_turn = {
        "role": "user",
        "content": image_parts + [{"type": "text", "text": question}],
    }

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        [user_turn], tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


326
327
328
329
330
def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for moonshotai/Kimi-VL-A3B-Instruct.

    The prompt is rendered with the chat template of the model's processor,
    which is loaded with ``trust_remote_code=True``.
    """
    model_name = "moonshotai/Kimi-VL-A3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=4,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # A single user turn: all image entries first, then the question text.
    image_parts = [{"type": "image", "image": url} for url in image_urls]
    user_turn = {
        "role": "user",
        "content": image_parts + [{"type": "text", "text": question}],
    }

    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
    prompt = processor.apply_chat_template(
        [user_turn], tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for mistralai/Mistral-Small-3.1-24B-Instruct-2503."""
    num_images = len(image_urls)

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": num_images},
    )

    # The image placeholders follow the question inside the [INST] block.
    image_tags = "[IMG]" * num_images
    prompt = f"<s>[INST]{question}\n{image_tags}[/INST]"

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


383
def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for meta-llama/Llama-3.2-11B-Vision-Instruct.

    The prompt contains one ``<|image|>`` placeholder per entry in
    ``image_urls`` so that it always agrees with ``limit_mm_per_prompt`` and
    with the number of images returned. For exactly two images the original
    hand-written wording is kept unchanged.
    """
    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
    num_images = len(image_urls)

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": num_images},
    )

    if num_images == 2:
        img_prompt = "Given the first image <|image|> and the second image<|image|>"
    else:
        # Previously the two-image wording was used for every image count,
        # leaving the placeholder count out of sync with the images supplied.
        noun = "image" if num_images == 1 else "images"
        img_prompt = f"Given the {noun} " + "<|image|>" * num_images
    prompt = f"<|begin_of_text|>{img_prompt}, {question}?"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


403
def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for nvidia/NVLM-D-72B.

    The prompt is rendered with the tokenizer's chat template.
    """
    model_name = "nvidia/NVLM-D-72B"
    num_images = len(image_urls)

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        tensor_parallel_size=4,
        limit_mm_per_prompt={"image": num_images},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    # Numbered placeholder entries: "Image-1: <image>", "Image-2: <image>", ...
    image_tags = "\n".join(
        f"Image-{index}: <image>\n" for index in range(1, num_images + 1)
    )
    user_turn = {"role": "user", "content": f"{image_tags}\n{question}"}

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        [user_turn], tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


433
434
# Ovis
def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for AIDC-AI/Ovis2-1B.

    The prompt is rendered with the tokenizer's chat template.
    """
    model_name = "AIDC-AI/Ovis2-1B"
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        limit_mm_per_prompt={"image": num_images},
    )

    # Numbered placeholder entries: "Image-1: <image>", "Image-2: <image>", ...
    image_tags = "\n".join(
        f"Image-{index}: <image>\n" for index in range(1, num_images + 1)
    )
    user_turn = {"role": "user", "content": f"{image_tags}\n{question}"}

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        [user_turn], tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


463
def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for mistral-community/pixtral-12b."""
    num_images = len(image_urls)

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model="mistral-community/pixtral-12b",
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": num_images},
    )

    # The image placeholders follow the question inside the [INST] block.
    image_tags = "[IMG]" * num_images
    prompt = f"<s>[INST]{question}\n{image_tags}[/INST]"

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


485
def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for microsoft/Phi-3.5-vision-instruct."""
    num_images = len(image_urls)

    # num_crops is an override kwarg to the multimodal image processor;
    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
    # to use 16 for single frame scenarios, and 4 for multi-frame.
    #
    # Generally speaking, a larger value for num_crops results in more
    # tokens per image instance, because it may scale the image more in
    # the image preprocessing. Some references in the model docs and the
    # formula for image tokens after the preprocessing
    # transform can be found below.
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    processor_kwargs = {"num_crops": 4}

    engine_args = EngineArgs(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": num_images},
        mm_processor_kwargs=processor_kwargs,
    )

    # Numbered placeholders, one per line: "<|image_1|>", "<|image_2|>", ...
    image_tags = "\n".join(
        f"<|image_{index}|>" for index in range(1, num_images + 1)
    )
    prompt = f"<|user|>\n{image_tags}\n{question}<|end|>\n<|assistant|>\n"

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


518
519
520
521
522
523
524
525
526
527
def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt, images and LoRA request for Phi-4-multimodal-instruct.

    Phi-4-multimodal-instruct supports both image and audio inputs; this
    loader prepares a multi-image request. The model snapshot is downloaded
    first so the vision LoRA directory inside it can be referenced by path.
    """
    num_images = len(image_urls)

    snapshot_dir = snapshot_download("microsoft/Phi-4-multimodal-instruct")
    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    vision_lora_path = os.path.join(snapshot_dir, "vision-lora")

    engine_args = EngineArgs(
        model=snapshot_dir,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": num_images},
        enable_lora=True,
        max_lora_rank=320,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"dynamic_hd": 4},
    )

    # Numbered placeholders with no separator: "<|image_1|><|image_2|>..."
    image_tags = "".join(
        f"<|image_{index}|>" for index in range(1, num_images + 1)
    )
    prompt = f"<|user|>{image_tags}{question}<|end|><|assistant|>"

    images = [fetch_image(url) for url in image_urls]
    vision_lora = LoRARequest("vision", 1, vision_lora_path)
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
        lora_requests=[vision_lora],
    )


551
def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt, images and stop tokens for Qwen/Qwen-VL-Chat.

    An explicit ChatML chat template is used to render the prompt and is also
    returned so that the chat runner can pass it along.
    """
    model_name = "Qwen/Qwen-VL-Chat"
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        limit_mm_per_prompt={"image": num_images},
    )

    # Numbered placeholder lines: "Picture 1: <img></img>", ...
    image_tags = "".join(
        f"Picture {index}: <img></img>\n" for index in range(1, num_images + 1)
    )

    # This model does not have a chat_template attribute on its tokenizer,
    # so we need to explicitly pass it. We use ChatML since it's used in the
    # generation utils of the model:
    # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating
    chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"  # noqa: E501

    user_turn = {"role": "user", "content": f"{image_tags}\n{question}"}
    prompt = tokenizer.apply_chat_template(
        [user_turn],
        tokenize=False,
        add_generation_prompt=True,
        chat_template=chat_template,
    )

    stop_token_ids = [
        tokenizer.convert_tokens_to_ids(token)
        for token in ("<|endoftext|>", "<|im_start|>", "<|im_end|>")
    ]

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=images,
        chat_template=chat_template,
    )


594
def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for Qwen/Qwen2-VL-7B-Instruct.

    When the optional `qwen_vl_utils` package is importable, each fetched
    image is resized to the dimensions computed by `smart_resize` and a
    shorter `max_model_len` is used; otherwise a warning is printed and the
    images are passed through unchanged with a longer `max_model_len`.
    """
    try:
        from qwen_vl_utils import smart_resize
    except ModuleNotFoundError:
        print(
            "WARNING: `qwen-vl-utils` not installed, input images will not "
            "be automatically resized. You can enable this functionality by "
            "`pip install qwen-vl-utils`."
        )
        smart_resize = None

    can_resize = smart_resize is not None
    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    # Tested on L40
    context_length = 4096 if can_resize else 32768
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=context_length,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # System turn followed by one user turn: images first, then the question.
    image_parts = [{"type": "image", "image": url} for url in image_urls]
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": image_parts + [{"type": "text", "text": question}],
        },
    ]

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    def load_image(url: str) -> Image:
        """Fetch one image and, when resizing is available, resize it."""
        image = fetch_image(url)
        if not can_resize:
            return image
        width, height = image.size
        # smart_resize takes and returns (height, width); PIL wants (w, h).
        new_height, new_width = smart_resize(
            height, width, max_pixels=1024 * 28 * 28
        )
        return image.resize((new_width, new_height))

    images = [load_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
651
652


653
def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for Qwen/Qwen2.5-VL-3B-Instruct.

    When the optional `qwen_vl_utils` package is importable, each fetched
    image is resized to the dimensions computed by `smart_resize` and a
    shorter `max_model_len` is used; otherwise a warning is printed and the
    images are passed through unchanged with a longer `max_model_len`.
    """
    try:
        from qwen_vl_utils import smart_resize
    except ModuleNotFoundError:
        print(
            "WARNING: `qwen-vl-utils` not installed, input images will not "
            "be automatically resized. You can enable this functionality by "
            "`pip install qwen-vl-utils`."
        )
        smart_resize = None

    can_resize = smart_resize is not None
    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

    context_length = 4096 if can_resize else 32768
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=context_length,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # System turn followed by one user turn: images first, then the question.
    image_parts = [{"type": "image", "image": url} for url in image_urls]
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": image_parts + [{"type": "text", "text": question}],
        },
    ]

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    def load_image(url: str) -> Image:
        """Fetch one image and, when resizing is available, resize it."""
        image = fetch_image(url)
        if not can_resize:
            return image
        width, height = image.size
        # smart_resize takes and returns (height, width); PIL wants (w, h).
        new_height, new_width = smart_resize(
            height, width, max_pixels=1024 * 28 * 28
        )
        return image.resize((new_width, new_height))

    images = [load_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


汪志鹏's avatar
汪志鹏 committed
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for omni-research/Tarsier-7b."""
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model="omni-research/Tarsier-7b",
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": num_images},
    )

    # All image placeholders are concatenated right after "USER: ".
    image_tags = "<image>" * num_images
    prompt = f"USER: {image_tags}\n{question}\n ASSISTANT:"

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


731
# Maps each value accepted by `--model-type` to the loader that prepares the
# request data for that model. The keys double as the argparse `choices`.
model_example_map = {
    "aria": load_aria,
    "aya_vision": load_aya_vision,
    "deepseek_vl_v2": load_deepseek_vl2,
    "gemma3": load_gemma3,
    "h2ovl_chat": load_h2ovl,
    "idefics3": load_idefics3,
    "internvl_chat": load_internvl,
    "kimi_vl": load_kimi_vl,
    "llama4": load_llama4,
    "mistral3": load_mistral3,
    "mllama": load_mllama,
    "NVLM_D": load_nvlm_d,
    "ovis": load_ovis,
    "phi3_v": load_phi3v,
    "phi4_mm": load_phi4mm,
    "pixtral_hf": load_pixtral_hf,
    "qwen_vl_chat": load_qwen_vl_chat,
    "qwen2_vl": load_qwen2_vl,
    "qwen2_5_vl": load_qwen2_5_vl,
    "smolvlm": load_smolvlm,
    "tarsier": load_tarsier,
}


756
def run_generate(
    model: str, question: str, image_urls: list[str], seed: Optional[int]
):
    """Run the selected model through `LLM.generate` and print the outputs.

    Args:
        model: Key into `model_example_map` selecting the loader to use.
        question: Question passed to the loader to build the prompt.
        image_urls: URLs of the images passed to the loader.
        seed: Seed forwarded to `vllm.LLM`; ``None`` leaves it unset.
    """
    req_data = model_example_map[model](question, image_urls)

    # Use the `seed` argument rather than the module-level `args`, so the
    # function also works when imported and called from other code.
    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**engine_args)

    sampling_params = SamplingParams(
        temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
    )

    outputs = llm.generate(
        {
            "prompt": req_data.prompt,
            "multi_modal_data": {"image": req_data.image_data},
        },
        sampling_params=sampling_params,
        lora_request=req_data.lora_requests,
    )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        print("-" * 50)
780
781


782
def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[int]):
    """Run the selected model through `LLM.chat` and print the outputs.

    The loader is used for its engine arguments, stop tokens, chat template
    and LoRA requests; the conversation itself is built here from `question`
    and `image_urls`.

    Args:
        model: Key into `model_example_map` selecting the loader to use.
        question: Text of the user turn.
        image_urls: URLs attached to the user turn as `image_url` parts.
        seed: Seed forwarded to `vllm.LLM`; ``None`` leaves it unset.
    """
    req_data = model_example_map[model](question, image_urls)

    # Disable other modalities to save memory; limits set by the loader win
    # over these zero defaults.
    loader_limits = dict(req_data.engine_args.limit_mm_per_prompt or {})
    req_data.engine_args.limit_mm_per_prompt = {
        "image": 0,
        "video": 0,
        "audio": 0,
    } | loader_limits

    llm = LLM(**(asdict(req_data.engine_args) | {"seed": seed}))

    sampling_params = SamplingParams(
        temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
    )

    # One user turn: the question text first, then one entry per image URL.
    text_part = {"type": "text", "text": question}
    image_parts = [
        {"type": "image_url", "image_url": {"url": image_url}}
        for image_url in image_urls
    ]
    conversation = [{"role": "user", "content": [text_part, *image_parts]}]

    outputs = llm.chat(
        conversation,
        sampling_params=sampling_params,
        chat_template=req_data.chat_template,
        lora_request=req_data.lora_requests,
    )

    separator = "-" * 50
    print(separator)
    for request_output in outputs:
        print(request_output.outputs[0].text)
        print(separator)
826
827


828
def parse_args():
    """Parse the command-line options of this demo and return the namespace."""
    parser = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "vision language models that support multi-image input for text "
        "generation"
    )

    # Which loader from `model_example_map` to run.
    parser.add_argument(
        "--model-type",
        "-m",
        type=str,
        default="phi3_v",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )

    # Which `vllm.LLM` entry point to exercise.
    parser.add_argument(
        "--method",
        type=str,
        default="generate",
        choices=["generate", "chat"],
        help="The method to run in `vllm.LLM`.",
    )

    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Set the seed when initializing `vllm.LLM`.",
    )

    # Bounded by the number of URLs available in IMAGE_URLS.
    max_images = len(IMAGE_URLS)
    parser.add_argument(
        "--num-images",
        "-n",
        type=int,
        choices=list(range(1, max_images + 1)),
        default=2,
        help="Number of images to use for the demo.",
    )

    return parser.parse_args()

865

866
867
868
869
870
def main(args: Namespace):
    """Run the demo described by the parsed command-line arguments.

    Raises:
        ValueError: If `args.method` is neither "generate" nor "chat".
    """
    model, method, seed = args.model_type, args.method, args.seed

    # Use only the first `num_images` URLs from the pool.
    image_urls = IMAGE_URLS[: args.num_images]

    if method == "generate":
        runner = run_generate
    elif method == "chat":
        runner = run_chat
    else:
        raise ValueError(f"Invalid method: {method}")

    runner(model, QUESTION, image_urls, seed)


if __name__ == "__main__":
    # Keep `args` bound at module level: code elsewhere in this file may read
    # it as a global.
    args = parse_args()
    main(args)