vision_language_multi_image.py 32 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
"""
This example shows how to use vLLM for running offline inference with
Cyrus Leung's avatar
Cyrus Leung committed
5
6
multi-image input on vision language models for text generation,
using the chat template defined by the model.
7
"""
8

9
import os
10
from argparse import Namespace
11
from dataclasses import asdict
12
from typing import NamedTuple, Optional
13

14
from huggingface_hub import snapshot_download
15
from PIL.Image import Image
16
from transformers import AutoProcessor, AutoTokenizer
17

18
from vllm import LLM, EngineArgs, SamplingParams
19
from vllm.lora.request import LoRARequest
20
21
22
23
24
25
26
from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser

# Question text for the example prompts (presumably what callers pass as the
# `question` argument of the loaders below; confirm against the CLI code).
QUESTION = "What is the content of each image?"

# Sample image URLs; each loader accepts such a list as `image_urls` and
# downloads every entry with `fetch_image`.
IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg",
]


40
class ModelRequestData(NamedTuple):
    """Bundle returned by every ``load_*`` function in this file.

    The first three fields are required; the remaining ones default to
    ``None`` and are only filled in by the loaders that need them.
    """

    # Engine configuration built by the loader for its model.
    engine_args: EngineArgs
    # Fully formatted prompt text, including the image placeholders.
    prompt: str
    # Images matching the placeholders in `prompt`.
    image_data: list[Image]
    # Optional token ids at which generation should stop.
    stop_token_ids: Optional[list[int]] = None
    # Optional chat template for models whose tokenizer does not ship one.
    chat_template: Optional[str] = None
    # Optional LoRA adapters to apply with the request.
    lora_requests: Optional[list[LoRARequest]] = None
47
48


49
50
51
52
53
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.


54
def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for ``rhymes-ai/Aria``.

    Args:
        question: Text appended after the image placeholders in the prompt.
        image_urls: URLs of the images; one placeholder is emitted per URL
            and each URL is downloaded with ``fetch_image``.

    Returns:
        A ``ModelRequestData`` with engine args, prompt, stop token ids and
        the fetched images.
    """
    model_name = "rhymes-ai/Aria"
    engine_args = EngineArgs(
        model=model_name,
        tokenizer_mode="slow",
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": len(image_urls)},
    )
    # One placeholder line per image, placed before the question.
    placeholders = "<fim_prefix><|img|><fim_suffix>\n" * len(image_urls)
    prompt = (
        f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n<|im_start|>assistant\n"
    )
    # NOTE(review): 93653 is listed twice; presumably harmless - confirm.
    stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
    )
75

76

Jennifer Zhao's avatar
Jennifer Zhao committed
77
78
79
80
81
82
83
84
85
86
def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for ``CohereForAI/aya-vision-8b``.

    The prompt is rendered with the chat template of the model's
    ``AutoProcessor``.

    Args:
        question: Text part of the user message.
        image_urls: URLs of the images; each becomes one image entry in the
            user message and is downloaded with ``fetch_image``.

    Returns:
        A ``ModelRequestData`` with engine args, prompt and fetched images.
    """
    model_name = "CohereForAI/aya-vision-8b"

    engine_args = EngineArgs(
        model=model_name,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Image entries come first, followed by the text question.
    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


110
def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for ``deepseek-ai/deepseek-vl2-tiny``.

    Args:
        question: Text placed after the image placeholders.
        image_urls: URLs of the images; one numbered ``<image>`` placeholder
            is emitted per URL and each URL is downloaded with
            ``fetch_image``.

    Returns:
        A ``ModelRequestData`` with engine args, prompt and fetched images.
    """
    model_name = "deepseek-ai/deepseek-vl2-tiny"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Placeholders are numbered from 1: "image_1:<image>\n", "image_2:..." etc.
    placeholder = "".join(
        f"image_{i}:<image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


133
def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for ``google/gemma-3-4b-it``.

    The prompt is rendered with the chat template of the model's
    ``AutoProcessor``.

    Args:
        question: Text part of the user message.
        image_urls: URLs of the images; each becomes one image entry in the
            user message and is downloaded with ``fetch_image``.

    Returns:
        A ``ModelRequestData`` with engine args, prompt and fetched images.
    """
    model_name = "google/gemma-3-4b-it"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Image entries come first, followed by the text question.
    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


167
def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for ``h2oai/h2ovl-mississippi-800m``.

    The prompt is rendered with the tokenizer's chat template and the
    tokenizer's EOS token id is used as the only stop token.

    Args:
        question: Text placed after the image placeholders.
        image_urls: URLs of the images; one numbered ``<image>`` placeholder
            is emitted per URL and each URL is downloaded with
            ``fetch_image``.

    Returns:
        A ``ModelRequestData`` with engine args, prompt, stop token ids and
        fetched images.
    """
    model_name = "h2oai/h2ovl-mississippi-800m"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    # Each entry already ends with "\n", so joining with "\n" leaves a blank
    # line between consecutive placeholders.
    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    stop_token_ids = [tokenizer.eos_token_id]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
    )


200
def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for ``HuggingFaceM4/Idefics3-8B-Llama3``.

    Args:
        question: Text placed after the image placeholders.
        image_urls: URLs of the images; one numbered ``<image>`` placeholder
            is emitted per URL and each URL is downloaded with
            ``fetch_image``.

    Returns:
        A ``ModelRequestData`` with engine args, prompt and fetched images.
    """
    model_name = "HuggingFaceM4/Idefics3-8B-Llama3"

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=16,
        enforce_eager=True,
        limit_mm_per_prompt={"image": len(image_urls)},
        # if you are running out of memory, you can reduce the "longest_edge".
        # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
        mm_processor_kwargs={
            "size": {"longest_edge": 2 * 364},
        },
    )

    # Each entry already ends with "\n", so joining with "\n" leaves a blank
    # line between consecutive placeholders.
    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


228
229
230
231
232
233
234
235
236
237
238
def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for ``HuggingFaceTB/SmolVLM2-2.2B-Instruct``.

    Args:
        question: Text placed after the image placeholders.
        image_urls: URLs of the images; one numbered ``<image>`` placeholder
            is emitted per URL and each URL is downloaded with
            ``fetch_image``.

    Returns:
        A ``ModelRequestData`` with engine args, prompt and fetched images.
    """
    model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=16,
        enforce_eager=True,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={
            "max_image_size": {"longest_edge": 384},
        },
    )

    # Each entry already ends with "\n", so joining with "\n" leaves a blank
    # line between consecutive placeholders.
    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    prompt = (
        f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
    )
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


256
def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for ``OpenGVLab/InternVL2-2B``.

    The prompt is rendered with the tokenizer's chat template, and a fixed
    list of stop tokens is converted to ids with the same tokenizer.

    Args:
        question: Text placed after the image placeholders.
        image_urls: URLs of the images; one numbered ``<image>`` placeholder
            is emitted per URL and each URL is downloaded with
            ``fetch_image``.

    Returns:
        A ``ModelRequestData`` with engine args, prompt, stop token ids and
        fetched images.
    """
    model_name = "OpenGVLab/InternVL2-2B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    # Each entry already ends with "\n", so joining with "\n" leaves a blank
    # line between consecutive placeholders.
    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for InternVL
    # Model variants may have different stop tokens;
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
    )
290
291


292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
    """Assemble engine args, prompt and images for ``llava-hf/llava-1.5-7b-hf``.

    The user turn holds one image entry per URL followed by the question,
    and is rendered through the processor's chat template.
    """
    # NOTE: CAUTION! The original LLaVA models were not really trained on
    # multi-image inputs, so expect poor responses for multi-image prompts!
    model_name = "llava-hf/llava-1.5-7b-hf"
    engine_args = EngineArgs(
        model=model_name,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Images first, then the text part of the user turn.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    prompt = AutoProcessor.from_pretrained(model_name).apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
    """Assemble engine args, prompt and images for LLaVA-NeXT.

    Uses ``llava-hf/llava-v1.6-mistral-7b-hf``; the user turn holds one image
    entry per URL followed by the question, and is rendered through the
    processor's chat template.
    """
    model_name = "llava-hf/llava-v1.6-mistral-7b-hf"
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Images first, then the text part of the user turn.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    prompt = AutoProcessor.from_pretrained(model_name).apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData:
    """Assemble engine args, prompt and images for LLaVA-OneVision.

    Uses ``llava-hf/llava-onevision-qwen2-7b-ov-hf``; the user turn holds one
    image entry per URL followed by the question, and is rendered through
    the processor's chat template.
    """
    model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=16384,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Images first, then the text part of the user turn.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    prompt = AutoProcessor.from_pretrained(model_name).apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


392
393
394
395
396
def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for ``meta-llama/Llama-4-Scout-17B-16E-Instruct``.

    The prompt is rendered with the chat template of the model's
    ``AutoProcessor``. The engine is configured with
    ``tensor_parallel_size=8``.

    Args:
        question: Text part of the user message.
        image_urls: URLs of the images; each becomes one image entry in the
            user message and is downloaded with ``fetch_image``.

    Returns:
        A ``ModelRequestData`` with engine args, prompt and fetched images.
    """
    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=131072,
        tensor_parallel_size=8,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Image entries come first, followed by the text question.
    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


426
427
428
429
430
def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for ``moonshotai/Kimi-VL-A3B-Instruct``.

    The prompt is rendered with the chat template of the model's
    ``AutoProcessor``, loaded with ``trust_remote_code=True``.

    Args:
        question: Text part of the user message.
        image_urls: URLs of the images; each becomes one image entry in the
            user message and is downloaded with ``fetch_image``.

    Returns:
        A ``ModelRequestData`` with engine args, prompt and fetched images.
    """
    model_name = "moonshotai/Kimi-VL-A3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=4,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Image entries come first, followed by the text question.
    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Assemble engine args, prompt and images for Mistral Small 3.1.

    The prompt is a single ``[INST]`` turn: the question, a newline, then
    one ``[IMG]`` tag per image URL.
    """
    num_images = len(image_urls)

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": num_images},
    )

    image_tags = "[IMG]" * num_images
    prompt = f"<s>[INST]{question}\n{image_tags}[/INST]"

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


483
def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for ``meta-llama/Llama-3.2-11B-Vision-Instruct``.

    The prompt contains exactly one ``<|image|>`` tag per entry in
    ``image_urls``, so the number of tags always matches both
    ``limit_mm_per_prompt`` and the number of fetched images. The original
    two-image wording is kept verbatim when exactly two URLs are given.

    Args:
        question: Text placed after the image description; a ``?`` is
            appended to it.
        image_urls: URLs of the images; each is downloaded with
            ``fetch_image``.

    Returns:
        A ``ModelRequestData`` with engine args, prompt and fetched images.
    """
    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    if len(image_urls) == 2:
        # Unchanged wording for the two-image case.
        img_prompt = "Given the first image <|image|> and the second image<|image|>"
    else:
        # Any other count: one numbered tag per image, so the tag count
        # cannot drift from len(image_urls).
        img_prompt = "Given " + ", ".join(
            f"image {i} <|image|>" for i, _ in enumerate(image_urls, start=1)
        )
    prompt = f"<|begin_of_text|>{img_prompt}, {question}?"
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


503
def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for ``nvidia/NVLM-D-72B``.

    The prompt is rendered with the tokenizer's chat template. The engine is
    configured with ``tensor_parallel_size=4``.

    Args:
        question: Text placed after the image placeholders.
        image_urls: URLs of the images; one numbered ``<image>`` placeholder
            is emitted per URL and each URL is downloaded with
            ``fetch_image``.

    Returns:
        A ``ModelRequestData`` with engine args, prompt and fetched images.
    """
    model_name = "nvidia/NVLM-D-72B"

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        tensor_parallel_size=4,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    # Each entry already ends with "\n", so joining with "\n" leaves a blank
    # line between consecutive placeholders.
    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


533
534
# Ovis
def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for ``AIDC-AI/Ovis2-1B``.

    The prompt is rendered with the tokenizer's chat template.

    Args:
        question: Text placed after the image placeholders.
        image_urls: URLs of the images; one numbered ``<image>`` placeholder
            is emitted per URL and each URL is downloaded with
            ``fetch_image``.

    Returns:
        A ``ModelRequestData`` with engine args, prompt and fetched images.
    """
    model_name = "AIDC-AI/Ovis2-1B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Each entry already ends with "\n", so joining with "\n" leaves a blank
    # line between consecutive placeholders.
    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


563
def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for ``mistral-community/pixtral-12b``.

    The prompt is a single ``[INST]`` turn: the question, a newline, then
    one ``[IMG]`` tag per image. The engine is configured with
    ``tensor_parallel_size=2``.

    Args:
        question: Text placed before the image tags.
        image_urls: URLs of the images; each is downloaded with
            ``fetch_image``.

    Returns:
        A ``ModelRequestData`` with engine args, prompt and fetched images.
    """
    model_name = "mistral-community/pixtral-12b"

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = "[IMG]" * len(image_urls)
    prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


585
def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for ``microsoft/Phi-3.5-vision-instruct``.

    Args:
        question: Text placed after the image placeholders.
        image_urls: URLs of the images; one ``<|image_N|>`` placeholder
            (numbered from 1) is emitted per URL and each URL is downloaded
            with ``fetch_image``.

    Returns:
        A ``ModelRequestData`` with engine args, prompt and fetched images.
    """
    # num_crops is an override kwarg to the multimodal image processor;
    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
    # to use 16 for single frame scenarios, and 4 for multi-frame.
    #
    # Generally speaking, a larger value for num_crops results in more
    # tokens per image instance, because it may scale the image more in
    # the image preprocessing. Some references in the model docs and the
    # formula for image tokens after the preprocessing
    # transform can be found below.
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    engine_args = EngineArgs(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"num_crops": 4},
    )
    placeholders = "\n".join(
        f"<|image_{i}|>" for i, _ in enumerate(image_urls, start=1)
    )
    prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


618
619
620
621
622
623
624
625
626
627
def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
    """
    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
    show how to process multi-image inputs.

    The model snapshot is downloaded first so that the ``vision-lora``
    sub-directory can be passed as a LoRA request.

    Args:
        question: Text placed after the image placeholders.
        image_urls: URLs of the images; one ``<|image_N|>`` placeholder
            (numbered from 1) is emitted per URL and each URL is downloaded
            with ``fetch_image``.

    Returns:
        A ``ModelRequestData`` with engine args, prompt, fetched images and
        a single LoRA request named ``"vision"``.
    """

    model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    vision_lora_path = os.path.join(model_path, "vision-lora")
    engine_args = EngineArgs(
        model=model_path,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        enable_lora=True,
        max_lora_rank=320,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"dynamic_hd": 4},
    )

    placeholders = "".join(f"<|image_{i}|>" for i, _ in enumerate(image_urls, start=1))
    prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
        lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
    )


651
def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for ``Qwen/Qwen-VL-Chat``.

    An explicit ChatML template is used both to render the prompt and as the
    ``chat_template`` field of the returned data.

    Args:
        question: Text placed after the image placeholders.
        image_urls: URLs of the images; one ``Picture N: <img></img>`` line
            (numbered from 1) is emitted per URL and each URL is downloaded
            with ``fetch_image``.

    Returns:
        A ``ModelRequestData`` with engine args, prompt, stop token ids,
        fetched images and the chat template.
    """
    model_name = "Qwen/Qwen-VL-Chat"
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        limit_mm_per_prompt={"image": len(image_urls)},
    )
    placeholders = "".join(
        f"Picture {i}: <img></img>\n" for i, _ in enumerate(image_urls, start=1)
    )

    # This model does not have a chat_template attribute on its tokenizer,
    # so we need to explicitly pass it. We use ChatML since it's used in the
    # generation utils of the model:
    # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating
    chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"  # noqa: E501

    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        chat_template=chat_template,
    )

    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
        chat_template=chat_template,
    )


694
def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for ``Qwen/Qwen2-VL-7B-Instruct``.

    If the optional ``qwen_vl_utils`` package is importable, every fetched
    image is resized with ``smart_resize`` and a 4096-token context is used;
    otherwise a warning is printed, images are passed through unchanged and
    the context length is raised to 32768.

    Args:
        question: Text part of the user message.
        image_urls: URLs of the images; each becomes one image entry in the
            user message and is downloaded with ``fetch_image``.

    Returns:
        A ``ModelRequestData`` with engine args, prompt and the (possibly
        resized) images.
    """
    try:
        from qwen_vl_utils import smart_resize
    except ModuleNotFoundError:
        print(
            "WARNING: `qwen-vl-utils` not installed, input images will not "
            "be automatically resized. You can enable this functionality by "
            "`pip install qwen-vl-utils`."
        )
        smart_resize = None

    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    # Tested on L40
    engine_args = EngineArgs(
        model=model_name,
        # Without resizing the images stay at full size, so a much longer
        # context is configured.
        max_model_len=32768 if smart_resize is None else 4096,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        },
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    if smart_resize is None:
        image_data = [fetch_image(url) for url in image_urls]
    else:

        def post_process_image(image: Image) -> Image:
            # PIL reports (width, height); smart_resize is called with
            # (height, width) and its result is unpacked in that order too.
            width, height = image.size
            resized_height, resized_width = smart_resize(
                height, width, max_pixels=1024 * 28 * 28
            )
            return image.resize((resized_width, resized_height))

        image_data = [post_process_image(fetch_image(url)) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )
751
752


753
def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for ``Qwen/Qwen2.5-VL-3B-Instruct``.

    If the optional ``qwen_vl_utils`` package is importable, every fetched
    image is resized with ``smart_resize`` and a 4096-token context is used;
    otherwise a warning is printed, images are passed through unchanged and
    the context length is raised to 32768.

    Args:
        question: Text part of the user message.
        image_urls: URLs of the images; each becomes one image entry in the
            user message and is downloaded with ``fetch_image``.

    Returns:
        A ``ModelRequestData`` with engine args, prompt and the (possibly
        resized) images.
    """
    try:
        from qwen_vl_utils import smart_resize
    except ModuleNotFoundError:
        print(
            "WARNING: `qwen-vl-utils` not installed, input images will not "
            "be automatically resized. You can enable this functionality by "
            "`pip install qwen-vl-utils`."
        )
        smart_resize = None

    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        # Without resizing the images stay at full size, so a much longer
        # context is configured.
        max_model_len=32768 if smart_resize is None else 4096,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        },
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    if smart_resize is None:
        image_data = [fetch_image(url) for url in image_urls]
    else:

        def post_process_image(image: Image) -> Image:
            # PIL reports (width, height); smart_resize is called with
            # (height, width) and its result is unpacked in that order too.
            width, height = image.size
            resized_height, resized_width = smart_resize(
                height, width, max_pixels=1024 * 28 * 28
            )
            return image.resize((resized_width, resized_height))

        image_data = [post_process_image(fetch_image(url)) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )


汪志鹏's avatar
汪志鹏 committed
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
    """Assemble engine args, prompt and images for ``omni-research/Tarsier-7b``.

    The prompt is a ``USER: ... ASSISTANT:`` exchange whose user part starts
    with one ``<image>`` tag per URL, followed by the question.
    """
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model="omni-research/Tarsier-7b",
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": num_images},
    )

    image_tags = "<image>" * num_images
    prompt = f"USER: {image_tags}\n{question}\n ASSISTANT:"

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for ``omni-research/Tarsier2-Recap-7b``.

    The prompt is a system turn, a user turn whose vision section holds one
    ``<|image_pad|>`` token per URL followed by the question, and an opened
    assistant turn. Every URL is downloaded with ``fetch_image``.

    Args:
        question: Text placed after the vision section of the user turn.
        image_urls: URLs of the images to attach; their count also sets the
            per-prompt image limit of the engine.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the prompt and
        the fetched images.
    """
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model="omni-research/Tarsier2-Recap-7b",
        trust_remote_code=True,
        max_model_len=32768,
        limit_mm_per_prompt={"image": num_images},
        # Override the architecture list taken from the HF config.
        hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
    )

    image_pads = "<|image_pad|>" * num_images
    prompt_pieces = [
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n",
        f"<|im_start|>user\n<|vision_start|>{image_pads}",
        f"<|vision_end|>{question}<|im_end|>\n",
        "<|im_start|>assistant\n",
    ]
    prompt = "".join(prompt_pieces)

    images = []
    for url in image_urls:
        images.append(fetch_image(url))

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


857
# Registry of the supported models: each key is a value accepted by the
# `--model-type` option, each value is the loader called as
# `loader(question, image_urls)` to build that model's request data.
model_example_map = {
    "aria": load_aria,
    "aya_vision": load_aya_vision,
    "deepseek_vl_v2": load_deepseek_vl2,
    "gemma3": load_gemma3,
    "h2ovl_chat": load_h2ovl,
    "idefics3": load_idefics3,
    "internvl_chat": load_internvl,
    "kimi_vl": load_kimi_vl,
    "llava": load_llava,
    "llava-next": load_llava_next,
    "llava-onevision": load_llava_onevision,
    "llama4": load_llama4,
    "mistral3": load_mistral3,
    "mllama": load_mllama,
    "NVLM_D": load_nvlm_d,
    "ovis": load_ovis,
    "phi3_v": load_phi3v,
    "phi4_mm": load_phi4mm,
    "pixtral_hf": load_pixtral_hf,
    "qwen_vl_chat": load_qwen_vl_chat,
    "qwen2_vl": load_qwen2_vl,
    "qwen2_5_vl": load_qwen2_5_vl,
    "smolvlm": load_smolvlm,
    "tarsier": load_tarsier,
    "tarsier2": load_tarsier2,
}


886
def run_generate(model, question: str, image_urls: list[str], seed: Optional[int]):
    """Run one multi-image request through ``LLM.generate`` and print the result.

    Args:
        model: Key of ``model_example_map`` selecting the loader to use.
        question: Question forwarded to the loader.
        image_urls: URLs of the images forwarded to the loader.
        seed: Seed merged into the engine arguments passed to ``LLM``.
    """
    req_data = model_example_map[model](question, image_urls)

    # Use the ``seed`` parameter, not the module-level ``args`` namespace: the
    # latter only exists when this file is executed as a script, so relying on
    # it breaks any caller that imports and calls this function directly.
    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**engine_args)

    # temperature=0.0 is passed so sampling is greedy.
    sampling_params = SamplingParams(
        temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
    )

    outputs = llm.generate(
        {
            "prompt": req_data.prompt,
            "multi_modal_data": {"image": req_data.image_data},
        },
        sampling_params=sampling_params,
        lora_request=req_data.lora_requests,
    )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        print("-" * 50)
910
911


912
def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[int]):
    """Run one multi-image request through ``LLM.chat`` and print the result.

    The conversation is a single user message made of the question text
    followed by one ``image_url`` part per URL.

    Args:
        model: Key of ``model_example_map`` selecting the loader to use.
        question: Text part of the user message.
        image_urls: URLs sent as ``image_url`` parts of the user message.
        seed: Seed merged into the engine arguments passed to ``LLM``.
    """
    request = model_example_map[model](question, image_urls)

    # Disable other modalities to save memory
    requested_limits = dict(request.engine_args.limit_mm_per_prompt or {})
    request.engine_args.limit_mm_per_prompt = {
        "image": 0,
        "video": 0,
        "audio": 0,
        **requested_limits,
    }

    llm_kwargs = asdict(request.engine_args)
    llm_kwargs["seed"] = seed
    llm = LLM(**llm_kwargs)

    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=256,
        stop_token_ids=request.stop_token_ids,
    )

    text_part = {"type": "text", "text": question}
    image_parts = [
        {"type": "image_url", "image_url": {"url": url}} for url in image_urls
    ]
    conversation = [{"role": "user", "content": [text_part, *image_parts]}]

    outputs = llm.chat(
        conversation,
        sampling_params=sampling_params,
        chat_template=request.chat_template,
        lora_request=request.lora_requests,
    )

    separator = "-" * 50
    print(separator)
    for completion in outputs:
        print(completion.outputs[0].text)
        print(separator)
956
957


958
def parse_args():
    """Parse the command-line options of this demo.

    Options: ``--model-type``/``-m`` (a key of ``model_example_map``),
    ``--method`` (``generate`` or ``chat``), ``--seed`` and
    ``--num-images``/``-n`` (between 1 and ``len(IMAGE_URLS)``).

    Returns:
        The namespace produced by the parser.
    """
    description = (
        "Demo on using vLLM for offline inference with "
        "vision language models that support multi-image input for text "
        "generation"
    )
    parser = FlexibleArgumentParser(description=description)

    parser.add_argument(
        "--model-type",
        "-m",
        type=str,
        choices=model_example_map.keys(),
        default="phi3_v",
        help='Huggingface "model_type".',
    )
    parser.add_argument(
        "--method",
        type=str,
        choices=["generate", "chat"],
        default="generate",
        help="The method to run in `vllm.LLM`.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Set the seed when initializing `vllm.LLM`.",
    )

    max_images = len(IMAGE_URLS)  # the max number of images
    parser.add_argument(
        "--num-images",
        "-n",
        type=int,
        choices=list(range(1, max_images + 1)),
        default=2,
        help="Number of images to use for the demo.",
    )

    parsed = parser.parse_args()
    return parsed

995

996
997
998
999
1000
def main(args: Namespace):
    """Run the demo selected by the parsed command-line options.

    Args:
        args: Namespace providing ``model_type``, ``method``, ``seed`` and
            ``num_images``.

    Raises:
        ValueError: If ``args.method`` is neither ``generate`` nor ``chat``.
    """
    model = args.model_type
    method = args.method
    seed = args.seed

    # Only the first `num_images` entries of the URL list are used.
    selected_urls = IMAGE_URLS[: args.num_images]

    if method == "generate":
        run_generate(model, QUESTION, selected_urls, seed)
        return

    if method == "chat":
        run_chat(model, QUESTION, selected_urls, seed)
        return

    raise ValueError(f"Invalid method: {method}")


# Script entry point: parse the command line, then run the selected demo.
if __name__ == "__main__":
    # The parsed namespace stays bound to the module-level name `args`.
    args = parse_args()
    main(args)