vision_language.py 37.8 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for text generation.

For most models, the prompt format should follow the corresponding examples
in the model's HuggingFace repository.
"""
9
import os
10
import random
11
from contextlib import contextmanager
12
13
from dataclasses import asdict
from typing import NamedTuple, Optional
14

15
from huggingface_hub import snapshot_download
16
17
from transformers import AutoTokenizer

18
from vllm import LLM, EngineArgs, SamplingParams
19
from vllm.assets.image import ImageAsset
20
from vllm.assets.video import VideoAsset
21
from vllm.lora.request import LoRARequest
22
from vllm.multimodal.image import convert_image_mode
23
24
from vllm.utils import FlexibleArgumentParser

25
26
27
28
29
30
31
32

class ModelRequestData(NamedTuple):
    """Bundle of values returned by each model-specific ``run_*`` helper."""
    # Engine configuration produced by the helper for its model.
    engine_args: EngineArgs
    # Fully formatted prompts; the helpers build one per input question.
    prompts: list[str]
    # Token ids the helper supplies as stop tokens; ``None`` when unset.
    stop_token_ids: Optional[list[int]] = None
    # LoRA requests supplied by the helper; ``None`` when unset.
    lora_requests: Optional[list[LoRARequest]] = None


33
34
35
36
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.

37

38
# Aria
39
def run_aria(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments, prompts and stop tokens for rhymes-ai/Aria.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` with engine arguments, prompts and stop token
        ids.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    # Each question is wrapped between the user-turn opener (which carries
    # the image token) and the assistant-turn opener.
    head = "<|im_start|>user\n<fim_prefix><|img|><fim_suffix>"
    tail = "<|im_end|>\n<|im_start|>assistant\n"
    prompts = [f"{head}{question}{tail}" for question in questions]

    # NOTE: Need L40 (or equivalent) to avoid OOM
    engine_args = EngineArgs(
        model="rhymes-ai/Aria",
        max_model_len=4096,
        max_num_seqs=2,
        dtype="bfloat16",
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=[93532, 93653, 944, 93421, 1019, 93653, 93519],
    )
63
64


Jennifer Zhao's avatar
Jennifer Zhao committed
65
66
67
68
69
70
71
72
73
74
# Aya Vision
def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for CohereForAI/aya-vision-8b.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    # User turn (with the image token) followed by the chatbot turn opener.
    turn_open = "<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>"
    turn_close = ("<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|>"
                  "<|CHATBOT_TOKEN|>")
    prompts = [
        f"{turn_open}{question}{turn_close}" for question in questions
    ]

    engine_args = EngineArgs(
        model="CohereForAI/aya-vision-8b",
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"crop_to_patches": True},
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(engine_args=engine_args, prompts=prompts)


87
# BLIP-2
88
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Salesforce/blip2-opt-6.7b.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    engine_args = EngineArgs(
        model="Salesforce/blip2-opt-6.7b",
        limit_mm_per_prompt={modality: 1},
    )

    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
    prompts = []
    for question in questions:
        prompts.append(f"Question: {question} Answer:")

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
103
104
105


# Chameleon
106
def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for facebook/chameleon-7b.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    engine_args = EngineArgs(
        model="facebook/chameleon-7b",
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    # The image placeholder goes after the question text.
    image_token = "<image>"
    prompts = [f"{question}{image_token}" for question in questions]

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
121
122


123
# Deepseek-VL2
124
def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for deepseek-ai/deepseek-vl2-tiny.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    prompts = []
    for question in questions:
        prompts.append(f"<|User|>: <image>\n{question}\n\n<|Assistant|>:")

    engine_args = EngineArgs(
        model="deepseek-ai/deepseek-vl2-tiny",
        max_model_len=4096,
        max_num_seqs=2,
        # Override the architecture name reported by the HF config.
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
146
147


148
# Florence2
149
def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for microsoft/Florence-2-large.

    The question text is not used: every prompt is the fixed task token
    ``<MORE_DETAILED_CAPTION>``, repeated once per question.

    Args:
        questions: Only the number of questions matters here.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    task_prompt = "<MORE_DETAILED_CAPTION>"
    prompts = [task_prompt for _ in questions]

    engine_args = EngineArgs(
        model="microsoft/Florence-2-large",
        tokenizer="Isotr0py/Florence-2-tokenizer",
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
168
169


170
# Fuyu
171
def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for adept/fuyu-8b.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    engine_args = EngineArgs(
        model="adept/fuyu-8b",
        max_model_len=2048,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    # Each prompt is just the question terminated by a newline.
    prompts = []
    for question in questions:
        prompts.append(f"{question}\n")

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
186
187


188
# Gemma 3
189
def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for google/gemma-3-4b-it.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    # User turn with the image token, then the opener of the model turn.
    user_open = "<bos><start_of_turn>user\n<start_of_image>"
    model_open = "<end_of_turn>\n<start_of_turn>model\n"
    prompts = [
        f"{user_open}{question}{model_open}" for question in questions
    ]

    engine_args = EngineArgs(
        model="google/gemma-3-4b-it",
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"do_pan_and_scan": True},
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
209
210


211
# GLM-4v
212
def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments, prompts and stop tokens for THUDM/glm-4v-9b.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` with engine arguments, prompts and stop token
        ids.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    # The prompt keeps eight spaces between the image tokens and the
    # question; they stem from a backslash line continuation in an earlier
    # spelling of this literal.
    # NOTE(review): the padding is likely unintentional -- confirm against
    # the model card before removing it.
    image_block = "<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>"
    padding = " " * 8
    prompts = [
        f"{image_block}{padding}{question}<|assistant|>"
        for question in questions
    ]

    engine_args = EngineArgs(
        model="THUDM/glm-4v-9b",
        max_model_len=2048,
        max_num_seqs=2,
        trust_remote_code=True,
        enforce_eager=True,
        hf_overrides={"architectures": ["GLM4VForCausalLM"]},
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=[151329, 151336, 151338],
    )
238
239
240


# H2OVL-Mississippi
241
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments, prompts and stop tokens for H2OVL-Mississippi.

    Prompts are rendered with the chat template of the model's tokenizer.

    Args:
        questions: Question strings; one conversation is built per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` with engine arguments, prompts and stop token
        ids.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    checkpoint = "h2oai/h2ovl-mississippi-800m"

    engine_args = EngineArgs(
        model=checkpoint,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={modality: 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(checkpoint,
                                              trust_remote_code=True)

    # One single-turn conversation per question, image placeholder first.
    conversations = []
    for question in questions:
        user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
        conversations.append([user_turn])

    prompts = tokenizer.apply_chat_template(
        conversations,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    eos_id = tokenizer.eos_token_id

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=[eos_id],
    )
272
273
274


# Idefics3-8B-Llama3
275
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Idefics3-8B-Llama3.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    # if you are running out of memory, you can reduce the "longest_edge".
    # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
    image_size = {"longest_edge": 3 * 364}

    engine_args = EngineArgs(
        model="HuggingFaceM4/Idefics3-8B-Llama3",
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        mm_processor_kwargs={"size": image_size},
        limit_mm_per_prompt={modality: 1},
    )

    head = "<|begin_of_text|>User:<image>"
    tail = "<end_of_utterance>\nAssistant:"
    prompts = [f"{head}{question}{tail}" for question in questions]

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
301
302


303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
# SmolVLM2-2.2B-Instruct
def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for SmolVLM2-2.2B-Instruct.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    head = "<|im_start|>User:<image>"
    tail = "<end_of_utterance>\nAssistant:"
    prompts = [f"{head}{question}{tail}" for question in questions]

    processor_kwargs = {"max_image_size": {"longest_edge": 384}}
    engine_args = EngineArgs(
        model="HuggingFaceTB/SmolVLM2-2.2B-Instruct",
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        mm_processor_kwargs=processor_kwargs,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(engine_args=engine_args, prompts=prompts)


331
# InternVL
332
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments, prompts and stop tokens for InternVL.

    Prompts are rendered with the chat template of the model's tokenizer.

    Args:
        questions: Question strings; one conversation is built per question.
        modality: Either ``"image"`` or ``"video"``; selects the placeholder
            token inserted before the question.

    Returns:
        ``ModelRequestData`` with engine arguments, prompts and stop token
        ids.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    # Validate first: otherwise an unsupported modality would leave
    # `placeholder` unbound and fail later with an opaque UnboundLocalError.
    placeholders = {"image": "<image>", "video": "<video>"}
    if modality not in placeholders:
        raise ValueError(f"Unsupported modality for InternVL: {modality!r} "
                         "(expected 'image' or 'video')")
    placeholder = placeholders[modality]

    model_name = "OpenGVLab/InternVL3-2B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={modality: 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    messages = [[{
        'role': 'user',
        'content': f"{placeholder}\n{question}"
    }] for question in questions]
    prompts = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
                                            add_generation_prompt=True)

    # Stop tokens for InternVL
    # models variants may have different stop tokens
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    candidate_ids = [
        tokenizer.convert_tokens_to_ids(token) for token in stop_tokens
    ]
    # Drop any stop token for which the tokenizer returned None.
    stop_token_ids = [
        token_id for token_id in candidate_ids if token_id is not None
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
373
374


375
376
377
378
379
380
381
382
383
384
385
386
387
# Kimi-VL
def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Kimi-VL-A3B-Instruct.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    engine_args = EngineArgs(
        model="moonshotai/Kimi-VL-A3B-Instruct",
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={modality: 1},
    )

    # User turn carrying the media block, then the assistant turn opener.
    user_open = ("<|im_user|>user<|im_middle|><|media_start|>image"
                 "<|media_content|><|media_pad|><|media_end|>")
    assistant_open = "<|im_end|><|im_assistant|>assistant<|im_middle|>"
    prompts = [
        f"{user_open}{question}{assistant_open}" for question in questions
    ]

    return ModelRequestData(engine_args=engine_args, prompts=prompts)


398
# LLaVA-1.5
399
def run_llava(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for llava-hf/llava-1.5-7b-hf.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    engine_args = EngineArgs(
        model="llava-hf/llava-1.5-7b-hf",
        max_model_len=4096,
        limit_mm_per_prompt={modality: 1},
    )

    prompts = []
    for question in questions:
        prompts.append(f"USER: <image>\n{question}\nASSISTANT:")

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
416
417
418


# LLaVA-1.6/LLaVA-NeXT
419
def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for llava-v1.6-mistral-7b-hf.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    engine_args = EngineArgs(
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        max_model_len=8192,
        limit_mm_per_prompt={modality: 1},
    )

    prompts = []
    for question in questions:
        prompts.append(f"[INST] <image>\n{question} [/INST]")

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
433
434
435
436


# LLaVA-NeXT-Video
# Currently only supports video input
437
438
def run_llava_next_video(questions: list[str],
                         modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for LLaVA-NeXT-Video-7B-hf.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Must be ``"video"``.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"video"``.
    """
    assert modality == "video"

    engine_args = EngineArgs(
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    prompts = []
    for question in questions:
        prompts.append(f"USER: <video>\n{question} ASSISTANT:")

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
455
456


457
# LLaVA-OneVision
458
459
def run_llava_onevision(questions: list[str],
                        modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for LLaVA-OneVision.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Either ``"image"`` or ``"video"``; selects the placeholder
            token (``<image>`` / ``<video>``) used in the prompt.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    # Validate first: otherwise an unsupported modality would leave
    # `prompts` unbound and fail later with an opaque UnboundLocalError.
    if modality not in ("image", "video"):
        raise ValueError(
            f"Unsupported modality for LLaVA-OneVision: {modality!r} "
            "(expected 'image' or 'video')")
    placeholder = f"<{modality}>"

    # The prompt keeps nine spaces between the user turn and the assistant
    # turn; they stem from a backslash line continuation in an earlier
    # spelling of this literal.
    # NOTE(review): the padding is likely unintentional -- confirm against
    # the model card before removing it.
    padding = " " * 9
    prompts = [
        f"<|im_start|>user {placeholder}\n{question}<|im_end|>{padding}"
        "<|im_start|>assistant\n" for question in questions
    ]

    engine_args = EngineArgs(
        model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
        max_model_len=16384,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
483
484


485
# Mantis
486
def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments, prompts and stop tokens for Mantis.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` with engine arguments, prompts and stop token
        ids.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    # Llama-3 style chat framing around "<question>\n<image>".
    user_header = '<|start_header_id|>user<|end_header_id|>\n\n'
    assistant_header = (
        '<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n')
    prompts = []
    for question in questions:
        body = f"{question}\n<image>"
        prompts.append(user_header + body + assistant_header)

    engine_args = EngineArgs(
        model="TIGER-Lab/Mantis-8B-siglip-llama3",
        max_model_len=4096,
        hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=[128009],
    )
508
509
510


# MiniCPM-V
511
def run_minicpmv_base(questions: list[str], modality: str,
                      model_name: str) -> ModelRequestData:
    """Prepare engine arguments, prompts and stop tokens for a MiniCPM model.

    Shared implementation behind ``run_minicpmv`` and ``run_minicpmo``.
    Prompts are rendered with the chat template of the model's tokenizer.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Either ``"image"`` or ``"video"``.
        model_name: Checkpoint to load; used for both tokenizer and engine.

    Returns:
        ``ModelRequestData`` with engine arguments, prompts and stop token
        ids.

    Raises:
        AssertionError: If ``modality`` is neither ``"image"`` nor
            ``"video"``.
    """
    assert modality in ["image", "video"]
    # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa

    # Known checkpoints and the modalities each one supports:
    #   2.0  (image):               "HwwwH/MiniCPM-V-2"
    #        The official repo doesn't work yet, so a fork is needed for now.
    #        See: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa
    #   2.5  (image):               "openbmb/MiniCPM-Llama3-V-2_5"
    #   2.6  (image, video):        "openbmb/MiniCPM-V-2_6"
    #   o2.6 (image, video, audio): "openbmb/MiniCPM-o-2_6"

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        limit_mm_per_prompt={modality: 1},
    )

    # NOTE The stop_token_ids are different for various versions of MiniCPM-V
    #   2.0: [tokenizer.eos_id]
    #   2.5: [tokenizer.eos_id, tokenizer.eot_id]
    #   2.6 / o2.6: the tokens below
    stop_tokens = ['<|im_end|>', '<|endoftext|>']
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    modality_placeholder = {
        "image": "(<image>./</image>)",
        "video": "(<video>./</video>)",
    }

    # The chat template is applied to one single-turn conversation at a time.
    prompts = [
        tokenizer.apply_chat_template(
            [{
                'role': 'user',
                'content': f"{modality_placeholder[modality]}\n{question}"
            }],
            tokenize=False,
            add_generation_prompt=True) for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
573
574


575
def run_minicpmo(questions: list[str], modality: str) -> ModelRequestData:
    """Run ``run_minicpmv_base`` with the ``openbmb/MiniCPM-o-2_6`` checkpoint."""
    checkpoint = "openbmb/MiniCPM-o-2_6"
    return run_minicpmv_base(questions, modality, checkpoint)
577
578


579
def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData:
    """Run ``run_minicpmv_base`` with the ``openbmb/MiniCPM-V-2_6`` checkpoint."""
    checkpoint = "openbmb/MiniCPM-V-2_6"
    return run_minicpmv_base(questions, modality, checkpoint)
581
582


583
584
585
586
587
588
589
590
591
592
593
594
# Mistral-3 HF-format
def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Mistral-Small-3.1 (HF format).

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    prompts = []
    for question in questions:
        prompts.append(f"<s>[INST]{question}\n[IMG][/INST]")

    # NOTE: Need L40 (or equivalent) to avoid OOM
    engine_args = EngineArgs(
        model="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(engine_args=engine_args, prompts=prompts)


606
# Llama 3.2
607
def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Llama-3.2-11B-Vision-Instruct.

    Prompts are rendered with the chat template of the model's tokenizer.

    Args:
        questions: Question strings; one conversation is built per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    checkpoint = "meta-llama/Llama-3.2-11B-Vision-Instruct"

    # Note: The default setting of max_num_seqs (256) and
    # max_model_len (131072) for this model may cause OOM.
    # You may lower either to run this example on lower-end GPUs.

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model=checkpoint,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # One single-turn conversation per question: an image part followed by
    # the question as a text part.
    conversations = []
    for question in questions:
        parts = [{"type": "image"}, {"type": "text", "text": question}]
        conversations.append([{"role": "user", "content": parts}])

    prompts = tokenizer.apply_chat_template(
        conversations,
        add_generation_prompt=True,
        tokenize=False,
    )

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
643
644


645
def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Llama-4-Scout-17B-16E-Instruct.

    Prompts are rendered with the chat template of the model's tokenizer.

    Args:
        questions: Question strings; one conversation is built per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts, with
        ``stop_token_ids`` explicitly set to ``None``.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    checkpoint = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

    engine_args = EngineArgs(
        model=checkpoint,
        max_model_len=8192,
        max_num_seqs=4,
        tensor_parallel_size=8,
        gpu_memory_utilization=0.4,
        limit_mm_per_prompt={modality: 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # One single-turn conversation per question: an image part followed by
    # the question as a text part.
    conversations = []
    for question in questions:
        parts = [{"type": "image"}, {"type": "text", "text": f"{question}"}]
        conversations.append([{"role": "user", "content": parts}])

    prompts = tokenizer.apply_chat_template(
        conversations,
        add_generation_prompt=True,
        tokenize=False,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=None,
    )


681
# Molmo
682
def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for allenai/Molmo-7B-D-0924.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    # The prompt keeps nine spaces between the user turn and the assistant
    # turn; they stem from a backslash line continuation in an earlier
    # spelling of this literal.
    # NOTE(review): the padding is likely unintentional -- confirm against
    # the model card before removing it.
    padding = " " * 9
    prompts = [
        f"<|im_start|>user <image>\n{question}<|im_end|>{padding}"
        "<|im_start|>assistant\n" for question in questions
    ]

    engine_args = EngineArgs(
        model="allenai/Molmo-7B-D-0924",
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
703
704


705
# NVLM-D
706
def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for nvidia/NVLM-D-72B.

    Prompts are rendered with the chat template of the model's tokenizer.

    Args:
        questions: Question strings; one conversation is built per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    checkpoint = "nvidia/NVLM-D-72B"

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=checkpoint,
        trust_remote_code=True,
        max_model_len=4096,
        tensor_parallel_size=4,
        limit_mm_per_prompt={modality: 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(checkpoint,
                                              trust_remote_code=True)

    # One single-turn conversation per question, image placeholder first.
    conversations = []
    for question in questions:
        user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
        conversations.append([user_turn])

    prompts = tokenizer.apply_chat_template(
        conversations,
        tokenize=False,
        add_generation_prompt=True,
    )

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
734
735


736
737
# Ovis
def run_ovis(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for AIDC-AI/Ovis2-1B.

    Prompts are rendered with the chat template of the model's tokenizer.

    Args:
        questions: Question strings; one conversation is built per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    checkpoint = "AIDC-AI/Ovis2-1B"

    engine_args = EngineArgs(
        model=checkpoint,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        limit_mm_per_prompt={modality: 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(checkpoint,
                                              trust_remote_code=True)

    # One single-turn conversation per question, image placeholder first.
    conversations = []
    for question in questions:
        user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
        conversations.append([user_turn])

    prompts = tokenizer.apply_chat_template(
        conversations,
        tokenize=False,
        add_generation_prompt=True,
    )

    return ModelRequestData(engine_args=engine_args, prompts=prompts)


767
# PaliGemma
768
def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for google/paligemma-3b-mix-224.

    The question text is not used: every prompt is the fixed string
    ``"caption en"``, repeated once per question.

    Args:
        questions: Only the number of questions matters here.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    engine_args = EngineArgs(
        model="google/paligemma-3b-mix-224",
        limit_mm_per_prompt={modality: 1},
    )

    # PaliGemma has special prompt format for VQA
    task_prompt = "caption en"
    prompts = [task_prompt for _ in questions]

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
782
783


784
# PaliGemma 2
785
def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for paligemma2-3b-ft-docci-448.

    The question text is not used: every prompt is the fixed string
    ``"caption en"``, repeated once per question.

    Args:
        questions: Only the number of questions matters here.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    engine_args = EngineArgs(
        model="google/paligemma2-3b-ft-docci-448",
        limit_mm_per_prompt={modality: 1},
    )

    # PaliGemma 2 has special prompt format for VQA
    task_prompt = "caption en"
    prompts = [task_prompt for _ in questions]

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
799
800


801
# Phi-3-Vision
802
def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Phi-3.5-vision-instruct.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    head = "<|user|>\n<|image_1|>\n"
    tail = "<|end|>\n<|assistant|>\n"
    prompts = [f"{head}{question}{tail}" for question in questions]

    # num_crops is an override kwarg to the multimodal image processor;
    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
    # to use 16 for single frame scenarios, and 4 for multi-frame.
    #
    # Generally speaking, a larger value for num_crops results in more
    # tokens per image instance, because it may scale the image more in
    # the image preprocessing. Some references in the model docs and the
    # formula for image tokens after the preprocessing
    # transform can be found below.
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    # Note - mm_processor_kwargs can also be passed to generate/chat calls
    processor_kwargs = {"num_crops": 16}

    engine_args = EngineArgs(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        mm_processor_kwargs=processor_kwargs,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
836
837


838
# Phi-4-multimodal-instruct
839
def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments, prompts and LoRA request for Phi-4-multimodal.

    Phi-4-multimodal-instruct supports both image and audio inputs; this
    helper shows how to process image inputs. The checkpoint is fetched with
    ``snapshot_download`` and the engine is pointed at the returned path.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` with engine arguments, prompts and a single
        LoRA request for the vision adapter.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    snapshot_dir = snapshot_download("microsoft/Phi-4-multimodal-instruct")
    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    vision_lora_path = os.path.join(snapshot_dir, "vision-lora")
    vision_lora = LoRARequest("vision", 1, vision_lora_path)

    prompts = []
    for question in questions:
        prompts.append(f"<|user|><|image_1|>{question}<|end|><|assistant|>")

    # Note - mm_processor_kwargs can also be passed to generate/chat calls
    processor_kwargs = {"dynamic_hd": 16}
    engine_args = EngineArgs(
        model=snapshot_dir,
        trust_remote_code=True,
        max_model_len=5120,
        max_num_seqs=2,
        max_num_batched_tokens=12800,
        enable_lora=True,
        max_lora_rank=320,
        mm_processor_kwargs=processor_kwargs,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        lora_requests=[vision_lora],
    )
871
872


873
# Pixtral HF-format
874
def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for mistral-community/pixtral-12b.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    prompts = []
    for question in questions:
        prompts.append(f"<s>[INST]{question}\n[IMG][/INST]")

    # NOTE: Need L40 (or equivalent) to avoid OOM
    engine_args = EngineArgs(
        model="mistral-community/pixtral-12b",
        max_model_len=6144,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
893
894


895
# Qwen
896
def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Qwen/Qwen-VL.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Must be ``"image"``.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    # The image slot follows the question text.
    image_slot = "Picture 1: <img></img>\n"
    prompts = [f"{question}{image_slot}" for question in questions]

    engine_args = EngineArgs(
        model="Qwen/Qwen-VL",
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
914
915


916
# Qwen2-VL
917
def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Qwen2-VL-7B-Instruct.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Either ``"image"`` or ``"video"``; selects the pad token
            placed between the vision start/end markers.

    Returns:
        ``ModelRequestData`` holding the engine arguments and prompts.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    # Validate first: otherwise an unsupported modality would leave
    # `placeholder` unbound and fail later with an opaque UnboundLocalError.
    placeholders = {"image": "<|image_pad|>", "video": "<|video_pad|>"}
    if modality not in placeholders:
        raise ValueError(f"Unsupported modality for Qwen2-VL: {modality!r} "
                         "(expected 'image' or 'video')")
    placeholder = placeholders[modality]

    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
        },
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
         f"{question}<|im_end|>\n"
         "<|im_start|>assistant\n") for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
949
950


Roger Wang's avatar
Roger Wang committed
951
# Qwen2.5-VL
952
def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments and prompts for the Qwen2.5-VL example.

    Args:
        questions: Questions to wrap in the chat-style prompt.
        modality: ``"image"`` or ``"video"``; selects the vision placeholder
            and the key used in ``limit_mm_per_prompt``.

    Returns:
        A ``ModelRequestData`` holding the engine arguments and one prompt
        per question.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    # Resolve the placeholder before doing anything else so an unsupported
    # modality fails with a clear message instead of an unbound `placeholder`
    # when the prompts are formatted.
    if modality == "image":
        placeholder = "<|image_pad|>"
    elif modality == "video":
        placeholder = "<|video_pad|>"
    else:
        raise ValueError(f"Modality {modality} is not supported.")

    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
         f"{question}<|im_end|>\n"
         "<|im_start|>assistant\n") for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
Roger Wang's avatar
Roger Wang committed
984
985


986
987
988
989
990
991
992
993
994
995
996
997
998
# Qwen2.5-Omni
def run_qwen2_5_omni(questions: list[str],
                     modality: str) -> ModelRequestData:
    """Build the engine arguments and prompts for the Qwen2.5-Omni example.

    Args:
        questions: Questions to wrap in the chat-style prompt.
        modality: ``"image"`` or ``"video"``; selects the vision placeholder
            and the key used in ``limit_mm_per_prompt``.

    Returns:
        A ``ModelRequestData`` holding the engine arguments and one prompt
        per question.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    # Resolve the placeholder before doing anything else so an unsupported
    # modality fails with a clear message instead of an unbound `placeholder`
    # when the prompts are formatted.
    if modality == "image":
        placeholder = "<|IMAGE|>"
    elif modality == "video":
        placeholder = "<|VIDEO|>"
    else:
        raise ValueError(f"Modality {modality} is not supported.")

    model_name = "Qwen/Qwen2.5-Omni-7B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": [1],
        },
        limit_mm_per_prompt={modality: 1},
    )

    default_system = (
        "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
        "Group, capable of perceiving auditory and visual inputs, as well as "
        "generating text and speech.")

    prompts = [(f"<|im_start|>system\n{default_system}<|im_end|>\n"
                f"<|im_start|>user\n<|vision_bos|>{placeholder}<|vision_eos|>"
                f"{question}<|im_end|>\n"
                "<|im_start|>assistant\n") for question in questions]
    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
# SkyworkR1V
def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments, prompts and stop tokens for Skywork-R1V.

    Only the ``"image"`` modality is accepted. Prompts are rendered with the
    model tokenizer's chat template from a single user turn per question,
    and the stop token ids are looked up with the same tokenizer.
    """
    assert modality == "image"

    model_name = "Skywork/Skywork-R1V-38B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={modality: 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)

    # One single-turn conversation per question.
    conversations = []
    for question in questions:
        user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
        conversations.append([user_turn])

    prompts = tokenizer.apply_chat_template(conversations,
                                            tokenize=False,
                                            add_generation_prompt=True)

    # Stop tokens for SkyworkR1V
    # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py
    stop_token_ids = [
        tokenizer.convert_tokens_to_ids(token)
        for token in ("<|end▁of▁sentence|>", "<|endoftext|>")
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


1057
# Maps each `--model-type` choice to the function that builds its request
# data (engine arguments, prompts and optional stop tokens / LoRA requests).
model_example_map = {
    "aria": run_aria,
    "aya_vision": run_aya_vision,
    "blip-2": run_blip2,
    "chameleon": run_chameleon,
    "deepseek_vl_v2": run_deepseek_vl2,
    "florence2": run_florence2,
    "fuyu": run_fuyu,
    "gemma3": run_gemma3,
    "glm4v": run_glm4v,
    "h2ovl_chat": run_h2ovl,
    "idefics3": run_idefics3,
    "internvl_chat": run_internvl,
    "kimi_vl": run_kimi_vl,
    "llava": run_llava,
    "llava-next": run_llava_next,
    "llava-next-video": run_llava_next_video,
    "llava-onevision": run_llava_onevision,
    "mantis": run_mantis,
    "minicpmo": run_minicpmo,
    "minicpmv": run_minicpmv,
    "mistral3": run_mistral3,
    "mllama": run_mllama,
    "llama4": run_llama4,
    "molmo": run_molmo,
    "NVLM_D": run_nvlm_d,
    "ovis": run_ovis,
    "paligemma": run_paligemma,
    "paligemma2": run_paligemma2,
    "phi3_v": run_phi3v,
    "phi4_mm": run_phi4mm,
    "pixtral_hf": run_pixtral_hf,
    "qwen_vl": run_qwen_vl,
    "qwen2_vl": run_qwen2_vl,
    "qwen2_5_vl": run_qwen2_5_vl,
    "qwen2_5_omni": run_qwen2_5_omni,
    "skywork_chat": run_skyworkr1v,
    "smolvlm": run_smolvlm,
}


1098
1099
1100
1101
1102
1103
1104
1105
1106
def get_multi_modal_input(args):
    """Load the demo asset and its questions for ``args.modality``.

    Returns a dict of the form:
    {
        "data": image or video,
        "questions": list of questions,
    }

    Raises ``ValueError`` when the modality is neither "image" nor "video".
    """
    modality = args.modality

    if modality == "image":
        # Input image and questions
        cherry_blossom = ImageAsset("cherry_blossom").pil_image
        rgb_image = convert_image_mode(cherry_blossom, "RGB")
        return {
            "data": rgb_image,
            "questions": [
                "What is the content of this image?",
                "Describe the content of this image in detail.",
                "What's in the image?",
                "Where is this image taken?",
            ],
        }

    if modality == "video":
        # Input video and question
        baby_reading = VideoAsset(name="baby_reading",
                                  num_frames=args.num_frames)
        return {
            "data": baby_reading.np_ndarrays,
            "questions": ["Why is this video funny?"],
        }

    raise ValueError(f"Modality {modality} is not supported.")


1136
1137
def apply_image_repeat(image_repeat_prob, num_prompts, data,
                       prompts: list[str], modality):
    """Repeats images with provided probability of "image_repeat_prob".
    Used to simulate hit/miss for the MM preprocessor cache.

    Args:
        image_repeat_prob: Probability in [0, 1] that a request reuses the
            previous image unchanged.
        num_prompts: Number of request dicts to produce.
        data: The starting image; it must provide ``copy()`` and
            ``putpixel()``. The object passed in is never modified, only
            copies of it are.
        prompts: Prompts to cycle through, one per request.
        modality: Key under which the image is stored in
            ``multi_modal_data``.

    Returns:
        A list of ``num_prompts`` dicts with ``"prompt"`` and
        ``"multi_modal_data"`` entries.
    """
    assert 0 <= image_repeat_prob <= 1.0
    no_yes = [0, 1]
    probs = [1.0 - image_repeat_prob, image_repeat_prob]

    inputs = []
    cur_image = data
    for i in range(num_prompts):
        res = random.choices(no_yes, probs)[0]
        if res == 0:
            # No repeat => Modify one pixel
            cur_image = cur_image.copy()
            # Spread the request index over the three 8-bit channels. Each
            # channel is reduced modulo 256 so it stays in range and distinct
            # indices (below 2**24) give distinct pixel values.
            new_val = ((i >> 16) % 256, (i >> 8) % 256, i % 256)
            cur_image.putpixel((0, 0), new_val)

        inputs.append({
            "prompt": prompts[i % len(prompts)],
            "multi_modal_data": {
                modality: cur_image
            }
        })

    return inputs


1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
@contextmanager
def time_counter(enable: bool):
    """Context manager that optionally reports how long its body took.

    When ``enable`` is false the body simply runs. When it is true, the
    wall-clock duration of the body is printed between two dashed rules
    after the body finishes.
    """
    if not enable:
        yield
        return

    import time
    began = time.time()
    yield
    elapsed_time = time.time() - began

    rule = "-" * 50
    print(rule)
    print(f"-- generate time = {elapsed_time}")
    print(rule)


1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
def parse_args():
    """Define the command-line options of this example and parse them.

    The options are registered in the order listed in ``option_specs``.
    """
    parser = FlexibleArgumentParser(
        description=('Demo on using vLLM for offline inference with '
                     'vision language models for text generation'))

    # (flags, keyword arguments) for each `add_argument` call, in order.
    option_specs = [
        (('--model-type', '-m'),
         dict(type=str,
              default="llava",
              choices=model_example_map.keys(),
              help='Huggingface "model_type".')),
        (('--num-prompts', ),
         dict(type=int, default=4, help='Number of prompts to run.')),
        (('--modality', ),
         dict(type=str,
              default="image",
              choices=['image', 'video'],
              help='Modality of the input.')),
        (('--num-frames', ),
         dict(type=int,
              default=16,
              help='Number of frames to extract from the video.')),
        (("--seed", ),
         dict(type=int,
              default=None,
              help="Set the seed when initializing `vllm.LLM`.")),
        (('--image-repeat-prob', ),
         dict(type=float,
              default=None,
              help=('Simulates the hit-ratio for multi-modal preprocessor '
                    'cache (if enabled)'))),
        (('--disable-mm-preprocessor-cache', ),
         dict(action='store_true',
              help=('If True, disables caching of multi-modal '
                    'preprocessor/mapper.'))),
        (('--time-generate', ),
         dict(action='store_true',
              help='If True, then print the total generate() call time')),
        (('--use-different-prompt-per-request', ),
         dict(action='store_true',
              help=('If True, then use different prompt (with the same '
                    'multi-modal data) for each request.'))),
    ]
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)

    return parser.parse_args()


1233
1234
1235
1236
1237
def main(args):
    """Run the selected vision-language example and print the generations.

    Looks up the request builder for ``args.model_type``, loads the image or
    video input, creates the ``LLM`` from the resulting engine arguments,
    assembles one or more requests and prints the first completion of each.

    Raises ``ValueError`` if ``args.model_type`` has no registered example.
    """
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    modality = args.modality
    mm_input = get_multi_modal_input(args)
    data, questions = mm_input["data"], mm_input["questions"]

    req_data = model_example_map[model](questions, modality)

    # Disable other modalities to save memory
    requested_limits = dict(req_data.engine_args.limit_mm_per_prompt or {})
    req_data.engine_args.limit_mm_per_prompt = {
        "image": 0,
        "video": 0,
        "audio": 0,
        **requested_limits,
    }

    engine_kwargs = asdict(req_data.engine_args)
    engine_kwargs.update(
        seed=args.seed,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    llm = LLM(**engine_kwargs)

    # Don't want to check the flag multiple times, so just hijack `prompts`.
    if args.use_different_prompt_per_request:
        prompts = req_data.prompts
    else:
        prompts = [req_data.prompts[0]]

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=req_data.stop_token_ids)

    assert args.num_prompts > 0
    if args.num_prompts == 1:
        # Single inference
        inputs = {
            "prompt": prompts[0],
            "multi_modal_data": {
                modality: data
            },
        }
    elif args.image_repeat_prob is not None:
        # Batch inference: repeat images with the specified probability
        inputs = apply_image_repeat(args.image_repeat_prob, args.num_prompts,
                                    data, prompts, modality)
    else:
        # Batch inference: use the same image for all prompts
        inputs = [{
            "prompt": prompts[idx % len(prompts)],
            "multi_modal_data": {
                modality: data
            },
        } for idx in range(args.num_prompts)]

    # Add LoRA request if applicable
    if req_data.lora_requests:
        lora_request = req_data.lora_requests * args.num_prompts
    else:
        lora_request = None

    with time_counter(args.time_generate):
        outputs = llm.generate(
            inputs,
            sampling_params=sampling_params,
            lora_request=lora_request,
        )

    rule = "-" * 50
    print(rule)
    for request_output in outputs:
        print(request_output.outputs[0].text)
        print(rule)
1308
1309
1310


if __name__ == "__main__":
    # Script entry point: parse the CLI flags, then run the demo.
    args = parse_args()
    main(args)