# vision_language_multi_image.py (37.2 KB)
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
"""
This example shows how to use vLLM for running offline inference with
multi-image input on vision language models for text generation,
using the chat template defined by the model.
"""
8

9
import os
10
from argparse import Namespace
11
from dataclasses import asdict
12
from typing import NamedTuple, Optional
13

14
from huggingface_hub import snapshot_download
15
from PIL.Image import Image
16
from transformers import AutoProcessor, AutoTokenizer
17

18
from vllm import LLM, EngineArgs, SamplingParams
19
from vllm.lora.request import LoRARequest
20
21
22
23
24
25
26
from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser

# Default question asked about the images.
QUESTION = "What is the content of each image?"

# Default image set; each loader emits one placeholder per URL passed to it.
IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg",
]


40
class ModelRequestData(NamedTuple):
    """Everything a ``load_*`` helper in this file returns for one model."""

    # Arguments used to construct the engine for the selected model.
    engine_args: EngineArgs
    # Fully formatted text prompt, image placeholders included.
    prompt: str
    # Images accompanying the prompt (fetched from the input URLs).
    image_data: list[Image]
    # Token ids at which generation should stop; None when the loader sets none.
    stop_token_ids: Optional[list[int]] = None
    # Chat template string; only set by loaders that pass one explicitly.
    chat_template: Optional[str] = None
    # LoRA adapters for the request; only set by loaders that need them.
    lora_requests: Optional[list[LoRARequest]] = None
47
48


49
50
51
52
53
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.


54
def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``rhymes-ai/Aria``.

    Args:
        question: Text placed after the image placeholders.
        image_urls: Image URLs; one placeholder is emitted per URL.

    Returns:
        Engine arguments, the hand-written prompt, stop token ids and the
        images fetched from ``image_urls``.
    """
    model_name = "rhymes-ai/Aria"
    engine_args = EngineArgs(
        model=model_name,
        tokenizer_mode="slow",
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": len(image_urls)},
    )
    placeholders = "<fim_prefix><|img|><fim_suffix>\n" * len(image_urls)
    prompt = (
        f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n<|im_start|>assistant\n"
    )
    stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
    )
75

76

# (commit attribution: Jennifer Zhao)
77
78
79
80
81
82
83
84
85
86
def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``CohereForAI/aya-vision-8b``.

    Args:
        question: Text placed after the image entries in the user message.
        image_urls: Image URLs; one image entry is emitted per URL.

    Returns:
        Engine arguments, the prompt rendered by the processor's chat
        template, and the images fetched from ``image_urls``.
    """
    model_name = "CohereForAI/aya-vision-8b"

    engine_args = EngineArgs(
        model=model_name,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    # tokenize=False: we want the rendered prompt text, not token ids.
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


110
def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``deepseek-ai/deepseek-vl2-tiny``.

    Args:
        question: Text placed after the numbered image placeholders.
        image_urls: Image URLs; one ``image_<i>:<image>`` line per URL.

    Returns:
        Engine arguments, the hand-written prompt and the images fetched
        from ``image_urls``.
    """
    model_name = "deepseek-ai/deepseek-vl2-tiny"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Placeholders are numbered starting from 1.
    placeholder = "".join(
        f"image_{i}:<image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


133
def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``google/gemma-3-4b-it``.

    Args:
        question: Text placed after the image entries in the user message.
        image_urls: Image URLs; one image entry is emitted per URL.

    Returns:
        Engine arguments, the prompt rendered by the processor's chat
        template, and the images fetched from ``image_urls``.
    """
    model_name = "google/gemma-3-4b-it"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    # tokenize=False: we want the rendered prompt text, not token ids.
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


167
def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``h2oai/h2ovl-mississippi-800m``.

    Args:
        question: Text placed after the numbered image placeholders.
        image_urls: Image URLs; one ``Image-<i>: <image>`` line per URL.

    Returns:
        Engine arguments, the prompt rendered by the tokenizer's chat
        template, the tokenizer's EOS id as the only stop token, and the
        images fetched from ``image_urls``.
    """
    model_name = "h2oai/h2ovl-mississippi-800m"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    stop_token_ids = [tokenizer.eos_token_id]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
    )


200
201
202
203
204
def load_hyperclovax_seed_vision(
    question: str, image_urls: list[str]
) -> ModelRequestData:
    """Build the multi-image request for HyperCLOVAX-SEED-Vision-Instruct-3B.

    Args:
        question: Text appended as the final content entry.
        image_urls: Image URLs; one image entry is emitted per URL.

    Returns:
        Engine arguments, the prompt rendered by the tokenizer's chat
        template, and the images fetched from ``image_urls``. No stop
        token ids are set.
    """
    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=16384,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Each image entry carries empty "ocr"/"lens_*" fields alongside the URL;
    # NOTE(review): presumably required by the model's chat template - confirm.
    image_entries = [
        {
            "type": "image",
            "image": image_url,
            "ocr": "",
            "lens_keywords": "",
            "lens_local_keywords": "",
        }
        for image_url in image_urls
    ]
    message = {
        "role": "user",
        "content": [*image_entries, {"type": "text", "text": question}],
    }

    prompt = tokenizer.apply_chat_template(
        [message],
        tokenize=False,
        add_generation_prompt=True,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


247
248
def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``HuggingFaceM4/Idefics3-8B-Llama3``.

    Args:
        question: Text placed after the numbered image placeholders.
        image_urls: Image URLs; one ``Image-<i>: <image>`` line per URL.

    Returns:
        Engine arguments, the hand-written prompt and the images fetched
        from ``image_urls``.
    """
    model_name = "HuggingFaceM4/Idefics3-8B-Llama3"

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=16,
        enforce_eager=True,
        limit_mm_per_prompt={"image": len(image_urls)},
        # if you are running out of memory, you can reduce the "longest_edge".
        # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
        mm_processor_kwargs={
            "size": {"longest_edge": 2 * 364},
        },
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


# (commit attribution: Lyu Han)
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``internlm/Intern-S1``.

    Args:
        question: Text placed after the numbered image placeholders.
        image_urls: Image URLs; one ``Image-<i>: <IMG_CONTEXT>`` line per URL.

    Returns:
        Engine arguments, the prompt rendered by the tokenizer's chat
        template, and the images fetched from ``image_urls``.
    """
    model_name = "internlm/Intern-S1"
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": num_images},
    )

    # Numbered placeholder lines (1-based), joined with an extra newline.
    image_lines = [
        f"Image-{idx}: <IMG_CONTEXT>\n" for idx in range(1, num_images + 1)
    ]
    user_text = "\n".join(image_lines) + f"\n{question}"
    conversation = [{"role": "user", "content": user_text}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    rendered = tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=rendered,
        image_data=images,
    )


302
def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``OpenGVLab/InternVL2-2B``.

    Args:
        question: Text placed after the numbered image placeholders.
        image_urls: Image URLs; one ``Image-<i>: <image>`` line per URL.

    Returns:
        Engine arguments, the prompt rendered by the tokenizer's chat
        template, stop token ids and the images fetched from ``image_urls``.
    """
    model_name = "OpenGVLab/InternVL2-2B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for InternVL
    # Model variants may have different stop tokens;
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
    )
336
337


338
339
def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for Llama-4-Scout-17B-16E-Instruct.

    Args:
        question: Text placed after the image entries in the user message.
        image_urls: Image URLs; one image entry is emitted per URL.

    Returns:
        Engine arguments (tensor parallel size 8), the prompt rendered by
        the processor's chat template, and the images fetched from
        ``image_urls``.
    """
    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=131072,
        tensor_parallel_size=8,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    # tokenize=False: we want the rendered prompt text, not token ids.
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


372
373
374
375
def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``llava-hf/llava-1.5-7b-hf``.

    Args:
        question: Text placed after the image entries in the user message.
        image_urls: Image URLs; one image entry is emitted per URL.

    Returns:
        Engine arguments, the prompt rendered by the processor's chat
        template, and the images fetched from ``image_urls``.
    """
    # NOTE: CAUTION! The original LLaVA models weren't really trained on
    # multi-image inputs; they will generate poor responses for such inputs!
    model_name = "llava-hf/llava-1.5-7b-hf"
    engine_args = EngineArgs(
        model=model_name,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


406
407
def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``llava-hf/llava-v1.6-mistral-7b-hf``.

    Args:
        question: Text placed after the image entries in the user message.
        image_urls: Image URLs; one image entry is emitted per URL.

    Returns:
        Engine arguments, the prompt rendered by the processor's chat
        template, and the images fetched from ``image_urls``.
    """
    model_name = "llava-hf/llava-v1.6-mistral-7b-hf"
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


439
440
def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for llava-onevision-qwen2-7b-ov-hf.

    Args:
        question: Text placed after the image entries in the user message.
        image_urls: Image URLs; one image entry is emitted per URL.

    Returns:
        Engine arguments, the prompt rendered by the processor's chat
        template, and the images fetched from ``image_urls``.
    """
    model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=16384,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``Kwai-Keye/Keye-VL-8B-Preview``.

    Args:
        question: Text placed after the image entries in the user message.
        image_urls: Image URLs; one image entry is emitted per URL.

    Returns:
        Engine arguments, the prompt rendered by the processor's chat
        template, and the images fetched from ``image_urls``.
    """
    model_name = "Kwai-Keye/Keye-VL-8B-Preview"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Image entries first, then the question as the trailing text entry.
    user_content = [{"type": "image", "image": link} for link in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
    rendered = processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=rendered,
        image_data=[fetch_image(link) for link in image_urls],
    )


509
510
511
512
513
def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``moonshotai/Kimi-VL-A3B-Instruct``.

    Args:
        question: Text placed after the image entries in the user message.
        image_urls: Image URLs; one image entry is emitted per URL.

    Returns:
        Engine arguments, the prompt rendered by the processor's chat
        template, and the images fetched from ``image_urls``.
    """
    model_name = "moonshotai/Kimi-VL-A3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=4,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


544
545
546
547
548
549
550
551
552
553
def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for Mistral-Small-3.1-24B-Instruct-2503.

    Args:
        question: Text placed before the ``[IMG]`` placeholders.
        image_urls: Image URLs; one ``[IMG]`` placeholder per URL.

    Returns:
        Engine arguments, the hand-written ``[INST]`` prompt and the images
        fetched from ``image_urls``.
    """
    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        ignore_patterns=["consolidated.safetensors"],
    )

    placeholders = "[IMG]" * len(image_urls)
    prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


567
def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for Llama-3.2-11B-Vision-Instruct.

    Args:
        question: Text placed after the image placeholders.
        image_urls: Image URLs; one ``<|image|>`` placeholder per URL.

    Returns:
        Engine arguments, the hand-written prompt and the images fetched
        from ``image_urls``.
    """
    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Emit exactly one placeholder per image. A prompt hard-coded for two
    # images disagrees with `image_data` and `limit_mm_per_prompt` whenever
    # a different number of URLs is passed in.
    placeholders = "<|image|>" * len(image_urls)
    prompt = f"<|begin_of_text|>{placeholders}{question}"
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


587
def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``nvidia/NVLM-D-72B``.

    Args:
        question: Text placed after the numbered image placeholders.
        image_urls: Image URLs; one ``Image-<i>: <image>`` line per URL.

    Returns:
        Engine arguments, the prompt rendered by the tokenizer's chat
        template, and the images fetched from ``image_urls``.
    """
    model_name = "nvidia/NVLM-D-72B"

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        tensor_parallel_size=4,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


617
618
# Ovis
def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``AIDC-AI/Ovis2-1B``.

    Args:
        question: Text placed after the numbered image placeholders.
        image_urls: Image URLs; one ``Image-<i>: <image>`` line per URL.

    Returns:
        Engine arguments, the prompt rendered by the tokenizer's chat
        template, and the images fetched from ``image_urls``.
    """
    model_name = "AIDC-AI/Ovis2-1B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


647
def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``mistral-community/pixtral-12b``.

    Args:
        question: Text placed before the ``[IMG]`` placeholders.
        image_urls: Image URLs; one ``[IMG]`` placeholder per URL.

    Returns:
        Engine arguments, the hand-written ``[INST]`` prompt and the images
        fetched from ``image_urls``.
    """
    model_name = "mistral-community/pixtral-12b"

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = "[IMG]" * len(image_urls)
    prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


669
def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``microsoft/Phi-3.5-vision-instruct``.

    Args:
        question: Text placed after the numbered image placeholders.
        image_urls: Image URLs; one ``<|image_<i>|>`` placeholder per URL.

    Returns:
        Engine arguments, the hand-written prompt and the images fetched
        from ``image_urls``.
    """
    # num_crops is an override kwarg to the multimodal image processor;
    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
    # to use 16 for single frame scenarios, and 4 for multi-frame.
    #
    # Generally speaking, a larger value for num_crops results in more
    # tokens per image instance, because it may scale the image more in
    # the image preprocessing. Some references in the model docs and the
    # formula for image tokens after the preprocessing
    # transform can be found below.
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    engine_args = EngineArgs(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"num_crops": 4},
    )
    # Placeholders are numbered starting from 1, one per line.
    placeholders = "\n".join(
        f"<|image_{i}|>" for i, _ in enumerate(image_urls, start=1)
    )
    prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


702
703
704
705
706
707
708
709
710
711
def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
    """
    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
    show how to process multi-image inputs.

    Args:
        question: Text placed after the numbered image placeholders.
        image_urls: Image URLs; one ``<|image_<i>|>`` placeholder per URL.

    Returns:
        Engine arguments (LoRA enabled), the hand-written prompt, the images
        fetched from ``image_urls`` and a LoRA request pointing at the
        ``vision-lora`` directory of the downloaded snapshot.
    """

    model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    vision_lora_path = os.path.join(model_path, "vision-lora")
    engine_args = EngineArgs(
        model=model_path,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        enable_lora=True,
        max_lora_rank=320,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"dynamic_hd": 4},
    )

    placeholders = "".join(f"<|image_{i}|>" for i, _ in enumerate(image_urls, start=1))
    prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
        lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
    )


735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
def load_phi4_multimodal(question: str, image_urls: list[str]) -> ModelRequestData:
    """
    Build the multi-image request for Phi-4-multimodal-instruct, which
    supports both image and audio inputs; only images are used here.

    Args:
        question: Text placed after the image placeholders.
        image_urls: Image URLs; one ``<|image|>`` placeholder per URL.

    Returns:
        Engine arguments (LoRA enabled), the hand-written prompt, the images
        fetched from ``image_urls`` and a LoRA request pointing at the
        ``vision-lora`` directory of the downloaded snapshot.
    """
    num_images = len(image_urls)

    # Snapshot is pinned to a specific revision of the model repository.
    snapshot_dir = snapshot_download(
        "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
    )
    # The vision and speech LoRA weights live next to the base model, so the
    # vision adapter's location has to be given explicitly.
    lora_dir = os.path.join(snapshot_dir, "vision-lora")

    engine_args = EngineArgs(
        model=snapshot_dir,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": num_images},
        enable_lora=True,
        max_lora_rank=320,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"dynamic_hd": 4},
    )

    image_tags = "<|image|>" * num_images
    rendered = f"<|user|>{image_tags}{question}<|end|><|assistant|>"
    images = [fetch_image(link) for link in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=rendered,
        image_data=images,
        lora_requests=[LoRARequest("vision", 1, lora_dir)],
    )


769
def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``Qwen/Qwen-VL-Chat``.

    Args:
        question: Text placed after the numbered image placeholders.
        image_urls: Image URLs; one ``Picture <i>: <img></img>`` line per URL.

    Returns:
        Engine arguments, the prompt rendered with an explicit ChatML
        template, stop token ids, the images fetched from ``image_urls``
        and the chat template string itself.
    """
    model_name = "Qwen/Qwen-VL-Chat"
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        limit_mm_per_prompt={"image": len(image_urls)},
    )
    placeholders = "".join(
        f"Picture {i}: <img></img>\n" for i, _ in enumerate(image_urls, start=1)
    )

    # This model does not have a chat_template attribute on its tokenizer,
    # so we need to explicitly pass it. We use ChatML since it's used in the
    # generation utils of the model:
    # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating
    chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"  # noqa: E501

    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        chat_template=chat_template,
    )

    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
        chat_template=chat_template,
    )


812
def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``Qwen/Qwen2-VL-7B-Instruct``.

    If ``qwen_vl_utils`` is importable, each fetched image is resized with
    ``smart_resize`` and a 4096 context length is used; otherwise a warning
    is printed, images are passed through unchanged and the context length
    is raised to 32768.

    Args:
        question: Text placed after the image entries in the user message.
        image_urls: Image URLs; one image entry is emitted per URL.

    Returns:
        Engine arguments, the prompt rendered by the processor's chat
        template, and the (possibly resized) images.
    """
    try:
        from qwen_vl_utils import smart_resize
    except ModuleNotFoundError:
        print(
            "WARNING: `qwen-vl-utils` not installed, input images will not "
            "be automatically resized. You can enable this functionality by "
            "`pip install qwen-vl-utils`."
        )
        smart_resize = None

    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    # Tested on L40
    engine_args = EngineArgs(
        model=model_name,
        # Without resizing, a larger context length is configured.
        max_model_len=32768 if smart_resize is None else 4096,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        },
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    if smart_resize is None:
        image_data = [fetch_image(url) for url in image_urls]
    else:

        def post_process_image(image: Image) -> Image:
            """Resize ``image`` to the dimensions chosen by ``smart_resize``."""
            width, height = image.size
            # smart_resize takes and returns (height, width); PIL's resize
            # expects (width, height).
            resized_height, resized_width = smart_resize(
                height, width, max_pixels=1024 * 28 * 28
            )
            return image.resize((resized_width, resized_height))

        image_data = [post_process_image(fetch_image(url)) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )
869
870


871
def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for running Qwen2.5-VL on several images.

    Args:
        question: Text question placed after the image entries in the user turn.
        image_urls: URLs of the images to attach; their count is also used as
            the per-prompt image limit.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the prompt
        rendered by the model's chat template, and the fetched images.
    """
    # `qwen-vl-utils` is optional: without it the images are passed through
    # at their original size and a larger context window is requested below.
    try:
        from qwen_vl_utils import smart_resize
    except ModuleNotFoundError:
        print(
            "WARNING: `qwen-vl-utils` not installed, input images will not "
            "be automatically resized. You can enable this functionality by "
            "`pip install qwen-vl-utils`."
        )
        smart_resize = None

    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        # Unresized images get the 32768 context; resized ones use 4096.
        max_model_len=32768 if smart_resize is None else 4096,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        },
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    if smart_resize is None:
        image_data = [fetch_image(url) for url in image_urls]
    else:

        def post_process_image(image: Image) -> Image:
            """Resize ``image`` to the dimensions chosen by ``smart_resize``."""
            width, height = image.size
            # smart_resize is called with (height, width) and its result is
            # unpacked in the same order; Image.resize wants (width, height).
            resized_height, resized_width = smart_resize(
                height, width, max_pixels=1024 * 28 * 28
            )
            return image.resize((resized_width, resized_height))

        image_data = [post_process_image(fetch_image(url)) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )


929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for running SmolVLM2 on several images.

    Every image receives a 1-based ``Image-<n>: <image>`` tag that is placed
    ahead of the question in a hand-written prompt.

    Args:
        question: Text question appended after the image tags.
        image_urls: URLs of the images to fetch and attach.

    Returns:
        A ``ModelRequestData`` with engine arguments, prompt and images.
    """
    num_images = len(image_urls)

    # The configuration below has been confirmed to launch on a single L40 GPU.
    smolvlm_engine_args = EngineArgs(
        model="HuggingFaceTB/SmolVLM2-2.2B-Instruct",
        max_model_len=8192,
        max_num_seqs=16,
        enforce_eager=True,
        limit_mm_per_prompt={"image": num_images},
        mm_processor_kwargs={
            "max_image_size": {"longest_edge": 384},
        },
    )

    # One tag per image; each tag already ends in a newline and the tags are
    # additionally newline-joined, exactly as the prompt format below expects.
    image_tags = [f"Image-{i}: <image>\n" for i in range(1, num_images + 1)]
    placeholders = "\n".join(image_tags)
    prompt = (
        f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
    )

    fetched_images = [fetch_image(image_url) for image_url in image_urls]
    return ModelRequestData(
        engine_args=smolvlm_engine_args,
        prompt=prompt,
        image_data=fetched_images,
    )


def load_step3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for running step3-fp8 on several images.

    The prompt is assembled by hand: a system sentence, one ``<im_patch>``
    token per image followed by the question, then the assistant header with
    an opening ``<think>`` tag.

    Args:
        question: Text question placed right after the image tokens.
        image_urls: URLs of the images to fetch and attach.

    Returns:
        A ``ModelRequestData`` with engine arguments, prompt and images.
    """
    num_images = len(image_urls)

    # NOTE: Below are verified configurations for step3-fp8
    # on 8xH100 GPUs.
    step3_engine_args = EngineArgs(
        model="stepfun-ai/step3-fp8",
        max_num_batched_tokens=4096,
        gpu_memory_utilization=0.85,
        tensor_parallel_size=8,
        limit_mm_per_prompt={"image": num_images},
        reasoning_parser="step3",
    )

    patch_tokens = "<im_patch>" * num_images
    prompt_segments = [
        "<|begin▁of▁sentence|> You are a helpful assistant. <|BOT|>user\n ",
        f"{patch_tokens}{question} <|EOT|><|BOT|",
        ">assistant\n<think>\n",
    ]
    prompt = "".join(prompt_segments)

    fetched_images = [fetch_image(image_url) for image_url in image_urls]

    return ModelRequestData(
        engine_args=step3_engine_args,
        prompt=prompt,
        image_data=fetched_images,
    )


汪志鹏's avatar
汪志鹏 committed
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for running Tarsier-7b on several images.

    Args:
        question: Text question placed on the line after the image tokens.
        image_urls: URLs of the images to fetch and attach; one ``<image>``
            token is emitted per URL.

    Returns:
        A ``ModelRequestData`` with engine arguments, prompt and images.
    """
    num_images = len(image_urls)

    tarsier_engine_args = EngineArgs(
        model="omni-research/Tarsier-7b",
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": num_images},
    )

    image_tokens = "<image>" * num_images
    prompt = f"USER: {image_tokens}\n{question}\n ASSISTANT:"

    fetched_images = [fetch_image(image_url) for image_url in image_urls]

    return ModelRequestData(
        engine_args=tarsier_engine_args,
        prompt=prompt,
        image_data=fetched_images,
    )


1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for running Tarsier2-Recap-7b on several images.

    The prompt is written by hand: a system turn, a user turn holding one
    ``<|image_pad|>`` token per image between the vision start/end markers
    followed by the question, and an open assistant turn.

    Args:
        question: Text question placed after the vision block.
        image_urls: URLs of the images to fetch and attach.

    Returns:
        A ``ModelRequestData`` with engine arguments, prompt and images.
    """
    num_images = len(image_urls)

    tarsier2_engine_args = EngineArgs(
        model="omni-research/Tarsier2-Recap-7b",
        trust_remote_code=True,
        max_model_len=32768,
        limit_mm_per_prompt={"image": num_images},
        # Override the architecture list read from the checkpoint's config.
        hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
    )

    image_pads = "<|image_pad|>" * num_images
    prompt_segments = [
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n",
        f"<|im_start|>user\n<|vision_start|>{image_pads}",
        f"<|vision_end|>{question}<|im_end|>\n",
        "<|im_start|>assistant\n",
    ]
    prompt = "".join(prompt_segments)

    fetched_images = [fetch_image(image_url) for image_url in image_urls]

    return ModelRequestData(
        engine_args=tarsier2_engine_args,
        prompt=prompt,
        image_data=fetched_images,
    )


1031
# Maps each `--model-type` choice to the loader that builds its request data.
# Every loader is called as `loader(question, image_urls)`.
model_example_map = {
    "aria": load_aria,
    "aya_vision": load_aya_vision,
    "deepseek_vl_v2": load_deepseek_vl2,
    "gemma3": load_gemma3,
    "h2ovl_chat": load_h2ovl,
    "hyperclovax_seed_vision": load_hyperclovax_seed_vision,
    "idefics3": load_idefics3,
    "interns1": load_interns1,
    "internvl_chat": load_internvl,
    "keye_vl": load_keye_vl,
    "kimi_vl": load_kimi_vl,
    "llama4": load_llama4,
    "llava": load_llava,
    "llava-next": load_llava_next,
    "llava-onevision": load_llava_onevision,
    "mistral3": load_mistral3,
    "mllama": load_mllama,
    "NVLM_D": load_nvlm_d,
    "ovis": load_ovis,
    "phi3_v": load_phi3v,
    "phi4_mm": load_phi4mm,
    "phi4_multimodal": load_phi4_multimodal,
    "pixtral_hf": load_pixtral_hf,
    "qwen_vl_chat": load_qwen_vl_chat,
    "qwen2_vl": load_qwen2_vl,
    "qwen2_5_vl": load_qwen2_5_vl,
    "smolvlm": load_smolvlm,
    "step3": load_step3,
    "tarsier": load_tarsier,
    "tarsier2": load_tarsier2,
}


1065
def run_generate(
    model: str, question: str, image_urls: list[str], seed: Optional[int]
):
    """Run one multi-image request through ``LLM.generate`` and print the output.

    Args:
        model: Key into ``model_example_map`` selecting the model loader.
        question: Text question passed to the loader.
        image_urls: URLs of the images passed to the loader.
        seed: Seed forwarded to the ``LLM`` constructor; may be ``None``.
    """
    req_data = model_example_map[model](question, image_urls)

    # Use the `seed` parameter, not a module-level `args` object, so the
    # function honours its argument and also works when imported.
    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**engine_args)

    sampling_params = SamplingParams(
        temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
    )

    outputs = llm.generate(
        {
            "prompt": req_data.prompt,
            "multi_modal_data": {"image": req_data.image_data},
        },
        sampling_params=sampling_params,
        lora_request=req_data.lora_requests,
    )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        print("-" * 50)
1089
1090


1091
def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[int]):
    """Run one multi-image request through ``LLM.chat`` and print the output.

    Only the engine arguments, stop tokens, chat template and LoRA requests
    from the loader are used; the conversation is rebuilt here from
    ``question`` and ``image_urls``.

    Args:
        model: Key into ``model_example_map`` selecting the model loader.
        question: Text question sent as the first content part of the user turn.
        image_urls: URLs sent as ``image_url`` content parts after the text.
        seed: Seed forwarded to the ``LLM`` constructor; may be ``None``.
    """
    req_data = model_example_map[model](question, image_urls)

    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    # Limits set by the loader take precedence over the zero defaults.
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {}
    )

    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**engine_args)

    sampling_params = SamplingParams(
        temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
    )
    outputs = llm.chat(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": question,
                    },
                    *(
                        {
                            "type": "image_url",
                            "image_url": {"url": image_url},
                        }
                        for image_url in image_urls
                    ),
                ],
            }
        ],
        sampling_params=sampling_params,
        chat_template=req_data.chat_template,
        lora_request=req_data.lora_requests,
    )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        print("-" * 50)
1135
1136


1137
def parse_args():
    """Parse the command-line flags of the multi-image demo.

    Flags:
        --model-type / -m: one of the ``model_example_map`` keys
            (default ``phi3_v``).
        --method: ``generate`` or ``chat`` (default ``generate``).
        --seed: optional integer seed (default ``None``).
        --num-images / -n: how many of ``IMAGE_URLS`` to use, from 1 up to
            ``len(IMAGE_URLS)`` (default 2).

    Returns:
        The result of ``parser.parse_args()``.
    """
    parser = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "vision language models that support multi-image input for text "
        "generation"
    )
    parser.add_argument(
        "--model-type",
        "-m",
        type=str,
        default="phi3_v",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )
    parser.add_argument(
        "--method",
        type=str,
        default="generate",
        choices=["generate", "chat"],
        help="The method to run in `vllm.LLM`.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Set the seed when initializing `vllm.LLM`.",
    )
    parser.add_argument(
        "--num-images",
        "-n",
        type=int,
        choices=list(range(1, len(IMAGE_URLS) + 1)),  # the max number of images
        default=2,
        help="Number of images to use for the demo.",
    )
    return parser.parse_args()

1174

1175
1176
1177
1178
1179
def main(args: Namespace):
    """Run the demo with the method selected on the command line.

    Args:
        args: Parsed flags; ``model_type``, ``method``, ``seed`` and
            ``num_images`` are read.

    Raises:
        ValueError: If ``args.method`` is neither ``generate`` nor ``chat``.
    """
    model = args.model_type
    method = args.method
    seed = args.seed

    # Use only the first `num_images` entries of the built-in URL list.
    image_urls = IMAGE_URLS[: args.num_images]

    if method == "generate":
        run_generate(model, QUESTION, image_urls, seed)
    elif method == "chat":
        run_chat(model, QUESTION, image_urls, seed)
    else:
        raise ValueError(f"Invalid method: {method}")


if __name__ == "__main__":
    # Parse the CLI flags and run the demo only when executed as a script.
    args = parse_args()
    main(args)