vision_language.py 37.5 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for text generation.

For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
9
import os
10
import random
11
from contextlib import contextmanager
12
13
from dataclasses import asdict
from typing import NamedTuple, Optional
14

15
from huggingface_hub import snapshot_download
16
17
from transformers import AutoTokenizer

18
from vllm import LLM, EngineArgs, SamplingParams
19
from vllm.assets.image import ImageAsset
20
from vllm.assets.video import VideoAsset
21
from vllm.lora.request import LoRARequest
22
23
from vllm.utils import FlexibleArgumentParser

24
25
26
27
28
29
30
31

class ModelRequestData(NamedTuple):
    """Bundle returned by every model-specific ``run_*`` helper in this file.

    Each helper builds the engine configuration and the formatted prompts for
    one model and hands them back in this tuple.
    """
    # Engine construction arguments for the selected model.
    engine_args: EngineArgs
    # Formatted prompts; the helpers build one prompt per input question.
    prompts: list[str]
    # Optional stop token ids; left as None by helpers that do not set any.
    stop_token_ids: Optional[list[int]] = None
    # Optional LoRA requests; among the helpers visible here only
    # run_phi4mm fills this in.
    lora_requests: Optional[list[LoRARequest]] = None


32
33
34
35
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.

36

37
# Aria
38
def run_aria(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments, prompts and stop tokens for rhymes-ai/Aria.

    Only the "image" modality is accepted. One prompt is produced per
    question.
    """
    assert modality == "image"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    engine_args = EngineArgs(
        model="rhymes-ai/Aria",
        max_model_len=4096,
        max_num_seqs=2,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": 1},
    )

    user_open = "<|im_start|>user\n<fim_prefix><|img|><fim_suffix>"
    assistant_open = "<|im_end|>\n<|im_start|>assistant\n"
    prompts = [
        f"{user_open}{question}{assistant_open}" for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=[93532, 93653, 944, 93421, 1019, 93653, 93519],
    )
62
63


Jennifer Zhao's avatar
Jennifer Zhao committed
64
65
66
67
68
69
70
71
72
73
# Aya Vision
def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for CohereForAI/aya-vision-8b.

    Only the "image" modality is accepted. One prompt is produced per
    question.
    """
    assert modality == "image"

    user_turn = "<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>"
    chatbot_turn = ("<|END_OF_TURN_TOKEN|>"
                    "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>")

    engine_args = EngineArgs(
        model="CohereForAI/aya-vision-8b",
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"crop_to_patches": True},
        limit_mm_per_prompt={"image": 1},
    )
    prompts = [
        f"{user_turn}{question}{chatbot_turn}" for question in questions
    ]
    return ModelRequestData(engine_args=engine_args, prompts=prompts)


86
# BLIP-2
87
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for Salesforce/blip2-opt-6.7b.

    Only the "image" modality is accepted. Each question is wrapped in the
    ``Question: ... Answer:`` format.
    """
    assert modality == "image"

    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
    prompts = [
        f"Question: {question} Answer:"
        for question in questions
    ]

    return ModelRequestData(
        engine_args=EngineArgs(
            model="Salesforce/blip2-opt-6.7b",
            limit_mm_per_prompt={"image": 1},
        ),
        prompts=prompts,
    )
102
103
104


# Chameleon
105
def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for facebook/chameleon-7b.

    Only the "image" modality is accepted. The ``<image>`` placeholder is
    appended after each question.
    """
    assert modality == "image"

    image_tag = "<image>"
    prompts = [f"{question}{image_tag}" for question in questions]

    args = EngineArgs(
        model="facebook/chameleon-7b",
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
    )
    return ModelRequestData(engine_args=args, prompts=prompts)
120
121


122
# Deepseek-VL2
123
def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for deepseek-ai/deepseek-vl2-tiny.

    Only the "image" modality is accepted. One prompt is produced per
    question.
    """
    assert modality == "image"

    engine_args = EngineArgs(
        model="deepseek-ai/deepseek-vl2-tiny",
        max_model_len=4096,
        max_num_seqs=2,
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
        limit_mm_per_prompt={"image": 1},
    )

    user_prefix = "<|User|>: <image>\n"
    assistant_suffix = "\n\n<|Assistant|>:"
    prompts = [
        f"{user_prefix}{question}{assistant_suffix}" for question in questions
    ]

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
145
146


147
# Florence2
148
def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for microsoft/Florence-2-large.

    Only the "image" modality is accepted. The question text is not used:
    every prompt is the fixed ``<MORE_DETAILED_CAPTION>`` task token, one per
    question.
    """
    assert modality == "image"

    task_prompt = "<MORE_DETAILED_CAPTION>"
    prompts = [task_prompt for _ in questions]

    engine_args = EngineArgs(
        model="microsoft/Florence-2-large",
        tokenizer="Isotr0py/Florence-2-tokenizer",
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
167
168


169
# Fuyu
170
def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for adept/fuyu-8b.

    Only the "image" modality is accepted. Each prompt is the question
    followed by a newline.
    """
    assert modality == "image"

    args = EngineArgs(
        model="adept/fuyu-8b",
        max_model_len=2048,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
    )
    newline_terminated = [f"{question}\n" for question in questions]

    return ModelRequestData(engine_args=args, prompts=newline_terminated)
185
186


187
# Gemma 3
188
def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for google/gemma-3-4b-it.

    Only the "image" modality is accepted. One prompt is produced per
    question.
    """
    assert modality == "image"

    engine_args = EngineArgs(
        model="google/gemma-3-4b-it",
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"do_pan_and_scan": True},
        limit_mm_per_prompt={"image": 1},
    )

    user_open = "<bos><start_of_turn>user\n<start_of_image>"
    model_open = "<end_of_turn>\n<start_of_turn>model\n"
    prompts = [f"{user_open}{question}{model_open}" for question in questions]

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
208
209


210
# GLM-4v
211
def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments, prompts and stop tokens for THUDM/glm-4v-9b.

    Only the "image" modality is accepted. One prompt is produced per
    question.
    """
    assert modality == "image"

    engine_args = EngineArgs(
        model="THUDM/glm-4v-9b",
        max_model_len=2048,
        max_num_seqs=2,
        trust_remote_code=True,
        enforce_eager=True,
        hf_overrides={"architectures": ["GLM4VForCausalLM"]},
        limit_mm_per_prompt={"image": 1},
    )

    image_part = "<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>"
    # Eight spaces sit between the image placeholder and the question. They
    # are spelled out so the prompt text stays exactly as this example has
    # always produced it.
    # NOTE(review): the gap looks accidental - confirm before removing it.
    gap = " " * 8
    prompts = [
        f"{image_part}{gap}{question}<|assistant|>" for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=[151329, 151336, 151338],
    )
237
238
239


# H2OVL-Mississippi
240
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments, prompts and stop tokens for H2OVL-Mississippi.

    Only the "image" modality is accepted. Prompts are rendered with the
    model's own chat template, and the tokenizer's EOS id is the stop token.
    """
    assert modality == "image"

    repo_id = "h2oai/h2ovl-mississippi-800m"

    engine_args = EngineArgs(
        model=repo_id,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={"image": 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)

    # One single-turn conversation per question.
    conversations = []
    for question in questions:
        user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
        conversations.append([user_turn])
    prompts = tokenizer.apply_chat_template(
        conversations,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=[tokenizer.eos_token_id],
    )
271
272
273


# Idefics3-8B-Llama3
274
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for HuggingFaceM4/Idefics3-8B-Llama3.

    Only the "image" modality is accepted. One prompt is produced per
    question.
    """
    assert modality == "image"

    # if you are running out of memory, you can reduce the "longest_edge".
    # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
    image_size = {"longest_edge": 3 * 364}

    engine_args = EngineArgs(
        model="HuggingFaceM4/Idefics3-8B-Llama3",
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        mm_processor_kwargs={"size": image_size},
        limit_mm_per_prompt={"image": 1},
    )

    user_open = "<|begin_of_text|>User:<image>"
    assistant_open = "<end_of_utterance>\nAssistant:"
    prompts = [
        f"{user_open}{question}{assistant_open}" for question in questions
    ]

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
300
301


302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
# SmolVLM2-2.2B-Instruct
def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for SmolVLM2-2.2B-Instruct.

    Only the "image" modality is accepted. One prompt is produced per
    question.
    """
    assert modality == "image"

    max_image_size = {"longest_edge": 384}
    engine_args = EngineArgs(
        model="HuggingFaceTB/SmolVLM2-2.2B-Instruct",
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        mm_processor_kwargs={"max_image_size": max_image_size},
        limit_mm_per_prompt={"image": 1},
    )

    user_open = "<|im_start|>User:<image>"
    assistant_open = "<end_of_utterance>\nAssistant:"
    prompts = [
        f"{user_open}{question}{assistant_open}" for question in questions
    ]

    return ModelRequestData(engine_args=engine_args, prompts=prompts)


330
# InternVL
331
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments, prompts and stop tokens for InternVL2-2B.

    Only the "image" modality is accepted. Prompts are rendered with the
    model's own chat template.
    """
    assert modality == "image"

    repo_id = "OpenGVLab/InternVL2-2B"

    engine_args = EngineArgs(
        model=repo_id,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)

    # One single-turn conversation per question.
    conversations = [
        [{'role': 'user', 'content': f"<image>\n{question}"}]
        for question in questions
    ]
    prompts = tokenizer.apply_chat_template(
        conversations,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Stop tokens for InternVL
    # models variants may have different stop tokens
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_words = ("<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>")
    stop_token_ids = [
        tokenizer.convert_tokens_to_ids(word) for word in stop_words
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
365
366


367
368
369
370
371
372
373
374
375
376
377
378
379
# Kimi-VL
def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for moonshotai/Kimi-VL-A3B-Instruct.

    Only the "image" modality is accepted. One prompt is produced per
    question.
    """
    assert modality == "image"

    user_open = ("<|im_user|>user<|im_middle|>"
                 "<|media_start|>image<|media_content|>"
                 "<|media_pad|><|media_end|>")
    assistant_open = "<|im_end|><|im_assistant|>assistant<|im_middle|>"
    prompts = [
        f"{user_open}{question}{assistant_open}" for question in questions
    ]

    engine_args = EngineArgs(
        model="moonshotai/Kimi-VL-A3B-Instruct",
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(engine_args=engine_args, prompts=prompts)


390
# LLaVA-1.5
391
def run_llava(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for llava-hf/llava-1.5-7b-hf.

    Only the "image" modality is accepted. One prompt is produced per
    question.
    """
    assert modality == "image"

    user_prefix = "USER: <image>\n"
    assistant_suffix = "\nASSISTANT:"
    prompts = [
        f"{user_prefix}{question}{assistant_suffix}" for question in questions
    ]

    args = EngineArgs(
        model="llava-hf/llava-1.5-7b-hf",
        max_model_len=4096,
        limit_mm_per_prompt={"image": 1},
    )
    return ModelRequestData(engine_args=args, prompts=prompts)
408
409
410


# LLaVA-1.6/LLaVA-NeXT
411
def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for llava-v1.6-mistral-7b-hf.

    Only the "image" modality is accepted. Each question is wrapped in an
    ``[INST] ... [/INST]`` block.
    """
    assert modality == "image"

    inst_open = "[INST] <image>\n"
    inst_close = " [/INST]"
    prompts = [f"{inst_open}{question}{inst_close}" for question in questions]

    args = EngineArgs(
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        max_model_len=8192,
        limit_mm_per_prompt={"image": 1},
    )
    return ModelRequestData(engine_args=args, prompts=prompts)
425
426
427
428


# LLaVA-NeXT-Video
# Currently only supports video input
429
430
def run_llava_next_video(questions: list[str],
                         modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for LLaVA-NeXT-Video-7B-hf.

    Args:
        questions: Questions about the video; one prompt is built per
            question.
        modality: Must be "video"; this helper supports no other input type.

    Returns:
        ModelRequestData holding the engine arguments and the prompts.

    Raises:
        AssertionError: If ``modality`` is not "video".
    """
    assert modality == "video"

    prompts = [
        f"USER: <video>\n{question} ASSISTANT:" for question in questions
    ]

    engine_args = EngineArgs(
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        max_model_len=8192,
        max_num_seqs=2,
        # Key the per-prompt limit by the modality actually used (video),
        # rather than by "image", which this helper never accepts.
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
447
448


449
# LLaVA-OneVision
450
451
def run_llava_onevision(questions: list[str],
                        modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for LLaVA-OneVision.

    Args:
        questions: Questions about the input; one prompt is built per
            question.
        modality: Either "image" or "video"; selects the placeholder tag
            inserted into the prompt.

    Returns:
        ModelRequestData holding the engine arguments and the prompts.

    Raises:
        ValueError: If ``modality`` is neither "image" nor "video".
    """
    placeholders = {"image": "<image>", "video": "<video>"}
    if modality not in placeholders:
        # Fail explicitly instead of hitting an unbound `prompts` variable
        # further down.
        raise ValueError(
            f"Unsupported modality {modality!r}; expected 'image' or 'video'")
    placeholder = placeholders[modality]

    # A space plus eight more spaces sit between the user turn and the
    # assistant turn. They are spelled out so the prompt text stays exactly
    # as this example has always produced it.
    # NOTE(review): the gap looks accidental - confirm before removing it.
    turn_gap = " " + " " * 8
    prompts = [
        (f"<|im_start|>user {placeholder}\n{question}<|im_end|>"
         f"{turn_gap}<|im_start|>assistant\n") for question in questions
    ]

    engine_args = EngineArgs(
        model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
        max_model_len=16384,
        # Limit the modality that is actually being sent.
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
475
476


477
# Mantis
478
def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments, prompts and stop tokens for Mantis.

    Only the "image" modality is accepted. Each question, followed by the
    ``<image>`` placeholder, is wrapped in a Llama-3 style chat template.
    """
    assert modality == "image"

    header_user = "<|start_header_id|>user<|end_header_id|>\n\n"
    header_assistant = (
        "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n")
    prompts = [
        f"{header_user}{question}\n<image>{header_assistant}"
        for question in questions
    ]

    engine_args = EngineArgs(
        model="TIGER-Lab/Mantis-8B-siglip-llama3",
        max_model_len=4096,
        hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=[128009],
    )
500
501
502


# MiniCPM-V
503
def run_minicpmv_base(questions: list[str], modality: str,
                      model_name: str) -> ModelRequestData:
    """Build engine arguments, prompts and stop tokens for a MiniCPM model.

    Shared implementation behind ``run_minicpmv`` and ``run_minicpmo``.

    Args:
        questions: Questions about the input; one prompt is built per
            question.
        modality: Either "image" or "video".
        model_name: Model repository to load, e.g. "openbmb/MiniCPM-V-2_6".

    Returns:
        ModelRequestData holding the engine arguments, the chat-templated
        prompts and the stop token ids.

    Raises:
        AssertionError: If ``modality`` is neither "image" nor "video".
    """
    assert modality in ["image", "video"]
    # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa

    # Candidate values for `model_name` and the modalities each supports:
    #   2.0  (image):               "HwwwH/MiniCPM-V-2"
    #        The official repo doesn't work yet, so we need to use a fork
    #        for now. For more details, please see:
    #        https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa
    #   2.5  (image):               "openbmb/MiniCPM-Llama3-V-2_5"
    #   2.6  (image, video):        "openbmb/MiniCPM-V-2_6"
    #   o2.6 (image, video, audio): "openbmb/MiniCPM-o-2_6"

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        # Limit the modality that is actually being sent (image or video).
        limit_mm_per_prompt={modality: 1},
    )

    # NOTE The stop_token_ids are different for various versions of MiniCPM-V
    # 2.0
    # stop_token_ids = [tokenizer.eos_id]
    #
    # 2.5
    # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
    #
    # 2.6 / o2.6
    stop_tokens = ['<|im_end|>', '<|endoftext|>']
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    modality_placeholder = {
        "image": "(<image>./</image>)",
        "video": "(<video>./</video>)",
    }
    placeholder = modality_placeholder[modality]

    # Each question becomes its own single-turn conversation.
    prompts = [
        tokenizer.apply_chat_template(
            [{
                'role': 'user',
                'content': f"{placeholder}\n{question}"
            }],
            tokenize=False,
            add_generation_prompt=True) for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
565
566


567
def run_minicpmo(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for openbmb/MiniCPM-o-2_6."""
    model_name = "openbmb/MiniCPM-o-2_6"
    return run_minicpmv_base(questions, modality, model_name)
569
570


571
def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for openbmb/MiniCPM-V-2_6."""
    model_name = "openbmb/MiniCPM-V-2_6"
    return run_minicpmv_base(questions, modality, model_name)
573
574


575
576
577
578
579
580
581
582
583
584
585
586
# Mistral-3 HF-format
def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for Mistral-Small-3.1 (HF format).

    Only the "image" modality is accepted. One prompt is produced per
    question.
    """
    assert modality == "image"

    inst_open = "<s>[INST]"
    inst_close = "\n[IMG][/INST]"
    prompts = [f"{inst_open}{question}{inst_close}" for question in questions]

    # NOTE: Need L40 (or equivalent) to avoid OOM
    engine_args = EngineArgs(
        model="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(engine_args=engine_args, prompts=prompts)


598
# Llama 3.2
599
def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for Llama-3.2-11B-Vision-Instruct.

    Only the "image" modality is accepted. Prompts are rendered with the
    model's own chat template from an image part plus a text part.
    """
    assert modality == "image"

    repo_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

    # Note: The default setting of max_num_seqs (256) and
    # max_model_len (131072) for this model may cause OOM.
    # You may lower either to run this example on lower-end GPUs.

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model=repo_id,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(repo_id)

    # One single-turn conversation per question.
    conversations = []
    for question in questions:
        content = [{"type": "image"}, {"type": "text", "text": question}]
        conversations.append([{"role": "user", "content": content}])

    prompts = tokenizer.apply_chat_template(
        conversations,
        add_generation_prompt=True,
        tokenize=False,
    )

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
635
636


637
def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for Llama-4-Scout-17B-16E-Instruct.

    Only the "image" modality is accepted. Prompts are rendered with the
    model's own chat template. No stop token ids are set.
    """
    assert modality == "image"

    repo_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

    engine_args = EngineArgs(
        model=repo_id,
        max_model_len=8192,
        max_num_seqs=4,
        tensor_parallel_size=8,
        gpu_memory_utilization=0.4,
        limit_mm_per_prompt={"image": 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(repo_id)

    # One single-turn conversation per question.
    conversations = [
        [{
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": f"{question}"},
            ],
        }]
        for question in questions
    ]
    prompts = tokenizer.apply_chat_template(
        conversations,
        add_generation_prompt=True,
        tokenize=False,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=None,
    )


673
# Molmo
674
def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for allenai/Molmo-7B-D-0924.

    Only the "image" modality is accepted. One prompt is produced per
    question.
    """
    assert modality == "image"

    engine_args = EngineArgs(
        model="allenai/Molmo-7B-D-0924",
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": 1},
    )

    # A space plus eight more spaces sit between the user turn and the
    # assistant turn. They are spelled out so the prompt text stays exactly
    # as this example has always produced it.
    # NOTE(review): the gap looks accidental - confirm before removing it.
    turn_gap = " " + " " * 8
    prompts = [
        (f"<|im_start|>user <image>\n{question}<|im_end|>"
         f"{turn_gap}<|im_start|>assistant\n") for question in questions
    ]

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
695
696


697
# NVLM-D
698
def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for nvidia/NVLM-D-72B.

    Only the "image" modality is accepted. Prompts are rendered with the
    model's own chat template.
    """
    assert modality == "image"

    repo_id = "nvidia/NVLM-D-72B"

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=repo_id,
        trust_remote_code=True,
        max_model_len=4096,
        tensor_parallel_size=4,
        limit_mm_per_prompt={"image": 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)

    # One single-turn conversation per question.
    conversations = [
        [{'role': 'user', 'content': f"<image>\n{question}"}]
        for question in questions
    ]
    prompts = tokenizer.apply_chat_template(
        conversations,
        tokenize=False,
        add_generation_prompt=True,
    )

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
726
727


728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
# Ovis2
def run_ovis2(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for AIDC-AI/Ovis2-1B.

    Only the "image" modality is accepted. Each prompt carries a fixed
    system message, the image placeholder and the question.
    """
    assert modality == "image"

    system_turn = ("<|im_start|>system\n"
                   "You are a helpful assistant.<|im_end|>\n")
    user_open = "<|im_start|>user\n<image>\n"
    assistant_open = "<|im_end|>\n<|im_start|>assistant\n"
    prompts = [
        f"{system_turn}{user_open}{question}{assistant_open}"
        for question in questions
    ]

    engine_args = EngineArgs(
        model="AIDC-AI/Ovis2-1B",
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        hf_overrides={"architectures": ["Ovis2ForConditionalGeneration"]},
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(engine_args=engine_args, prompts=prompts)


756
# PaliGemma
757
def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for google/paligemma-3b-mix-224.

    Only the "image" modality is accepted. The question text is not used:
    every prompt is the fixed string "caption en", one per question.
    """
    assert modality == "image"

    # PaliGemma has special prompt format for VQA
    caption_task = "caption en"
    prompts = [caption_task for _ in questions]

    return ModelRequestData(
        engine_args=EngineArgs(
            model="google/paligemma-3b-mix-224",
            limit_mm_per_prompt={"image": 1},
        ),
        prompts=prompts,
    )
771
772


773
# PaliGemma 2
774
def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for paligemma2-3b-ft-docci-448.

    Only the "image" modality is accepted. The question text is not used:
    every prompt is the fixed string "caption en", one per question.
    """
    assert modality == "image"

    # PaliGemma 2 has special prompt format for VQA
    caption_task = "caption en"
    prompts = [caption_task for _ in questions]

    return ModelRequestData(
        engine_args=EngineArgs(
            model="google/paligemma2-3b-ft-docci-448",
            limit_mm_per_prompt={"image": 1},
        ),
        prompts=prompts,
    )
788
789


790
# Phi-3-Vision
791
def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for Phi-3.5-vision-instruct.

    Only the "image" modality is accepted. One prompt is produced per
    question.
    """
    assert modality == "image"

    user_open = "<|user|>\n<|image_1|>\n"
    assistant_open = "<|end|>\n<|assistant|>\n"
    prompts = [
        f"{user_open}{question}{assistant_open}" for question in questions
    ]

    # num_crops is an override kwarg to the multimodal image processor;
    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
    # to use 16 for single frame scenarios, and 4 for multi-frame.
    #
    # Generally speaking, a larger value for num_crops results in more
    # tokens per image instance, because it may scale the image more in
    # the image preprocessing. Some references in the model docs and the
    # formula for image tokens after the preprocessing
    # transform can be found below.
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    processor_overrides = {"num_crops": 16}

    engine_args = EngineArgs(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs=processor_overrides,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
825
826


827
# Phi-4-multimodal-instruct
828
def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
    """
    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
    show how to process image inputs.

    The model snapshot is downloaded first, and the returned request data
    carries a LoRA request pointing at its "vision-lora" subdirectory.
    """
    assert modality == "image"

    snapshot_dir = snapshot_download("microsoft/Phi-4-multimodal-instruct")
    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    vision_lora_path = os.path.join(snapshot_dir, "vision-lora")

    user_open = "<|user|><|image_1|>"
    assistant_open = "<|end|><|assistant|>"
    prompts = [
        f"{user_open}{question}{assistant_open}" for question in questions
    ]

    engine_args = EngineArgs(
        model=snapshot_dir,
        trust_remote_code=True,
        max_model_len=5120,
        max_num_seqs=2,
        max_num_batched_tokens=12800,
        enable_lora=True,
        max_lora_rank=320,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"dynamic_hd": 16},
        limit_mm_per_prompt={"image": 1},
    )

    vision_lora = LoRARequest("vision", 1, vision_lora_path)
    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        lora_requests=[vision_lora],
    )
860
861


862
# Pixtral HF-format
863
def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for mistral-community/pixtral-12b.

    Only the "image" modality is accepted. One prompt is produced per
    question.
    """
    assert modality == "image"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    engine_args = EngineArgs(
        model="mistral-community/pixtral-12b",
        max_model_len=6144,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
    )

    inst_open = "<s>[INST]"
    inst_close = "\n[IMG][/INST]"
    prompts = [f"{inst_open}{question}{inst_close}" for question in questions]

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
882
883


884
# Qwen
885
def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for Qwen/Qwen-VL.

    Only the "image" modality is accepted. The picture placeholder is
    appended after each question.
    """
    assert modality == "image"

    engine_args = EngineArgs(
        model="Qwen/Qwen-VL",
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        limit_mm_per_prompt={"image": 1},
    )

    picture_slot = "Picture 1: <img></img>\n"
    prompts = [f"{question}{picture_slot}" for question in questions]

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
903
904


905
# Qwen2-VL
906
def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for Qwen/Qwen2-VL-7B-Instruct.

    Args:
        questions: Questions about the input; one prompt is built per
            question.
        modality: Either "image" or "video"; selects the vision pad token
            inserted into the prompt.

    Returns:
        ModelRequestData holding the engine arguments and the prompts.

    Raises:
        ValueError: If ``modality`` is neither "image" nor "video".
    """
    placeholders = {"image": "<|image_pad|>", "video": "<|video_pad|>"}
    if modality not in placeholders:
        # Fail explicitly instead of hitting an unbound `placeholder`
        # variable further down.
        raise ValueError(
            f"Unsupported modality {modality!r}; expected 'image' or 'video'")
    placeholder = placeholders[modality]

    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
        },
        # Limit the modality that is actually being sent.
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
         f"{question}<|im_end|>\n"
         "<|im_start|>assistant\n") for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
938
939


Roger Wang's avatar
Roger Wang committed
940
# Qwen2.5-VL
941
def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for Qwen/Qwen2.5-VL-3B-Instruct.

    Args:
        questions: Questions about the input; one prompt is built per
            question.
        modality: Either "image" or "video"; selects the vision pad token
            inserted into the prompt.

    Returns:
        ModelRequestData holding the engine arguments and the prompts.

    Raises:
        ValueError: If ``modality`` is neither "image" nor "video".
    """
    placeholders = {"image": "<|image_pad|>", "video": "<|video_pad|>"}
    if modality not in placeholders:
        # Fail explicitly instead of hitting an unbound `placeholder`
        # variable further down.
        raise ValueError(
            f"Unsupported modality {modality!r}; expected 'image' or 'video'")
    placeholder = placeholders[modality]

    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
        # Limit the modality that is actually being sent.
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
         f"{question}<|im_end|>\n"
         "<|im_start|>assistant\n") for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
Roger Wang's avatar
Roger Wang committed
973
974


975
976
977
978
979
980
981
982
983
984
985
986
987
# Qwen2.5-Omni
def run_qwen2_5_omni(questions: list[str],
                     modality: str) -> ModelRequestData:
    """Build the engine arguments and prompts for Qwen2.5-Omni.

    Args:
        questions: Questions to ask about the visual input; one prompt is
            produced per question.
        modality: Either "image" or "video"; selects the placeholder token
            inserted between the vision begin/end markers.

    Returns:
        The engine arguments together with the formatted prompts.

    Raises:
        ValueError: If ``modality`` is neither "image" nor "video".
    """
    placeholders = {
        "image": "<|IMAGE|>",
        "video": "<|VIDEO|>",
    }
    if modality not in placeholders:
        raise ValueError(f"Modality {modality} is not supported.")
    placeholder = placeholders[modality]

    model_name = "Qwen/Qwen2.5-Omni-7B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": [1],
        },
        # Budget the modality that is actually requested. `main` fills every
        # modality missing from this dict with 0, so a hard-coded
        # {"image": 1} would leave video runs without any video budget.
        limit_mm_per_prompt={modality: 1},
    )

    default_system = (
        "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
        "Group, capable of perceiving auditory and visual inputs, as well as "
        "generating text and speech.")

    prompts = [(f"<|im_start|>system\n{default_system}<|im_end|>\n"
                f"<|im_start|>user\n<|vision_bos|>{placeholder}<|vision_eos|>"
                f"{question}<|im_end|>\n"
                "<|im_start|>assistant\n") for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
# SkyworkR1V
def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments, chat prompts and stop tokens for Skywork R1V.

    Only the "image" modality is accepted. Each question becomes a
    single-turn user message that is rendered with the chat template shipped
    with the model's tokenizer.
    """
    assert modality == "image"

    model_name = "Skywork/Skywork-R1V-38B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)

    # One conversation per question, with the image placeholder placed ahead
    # of the question text.
    conversations = []
    for question in questions:
        user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
        conversations.append([user_turn])

    prompts = tokenizer.apply_chat_template(conversations,
                                            tokenize=False,
                                            add_generation_prompt=True)

    # Stop tokens for SkyworkR1V
    # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py
    stop_token_ids = [
        tokenizer.convert_tokens_to_ids(token)
        for token in ("<|end▁of▁sentence|>", "<|endoftext|>")
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


1046
# Maps each `--model-type` choice to the function that builds its request
# data (engine arguments, prompts and optional stop tokens / LoRA requests).
model_example_map = {
    "aria": run_aria,
    "aya_vision": run_aya_vision,
    "blip-2": run_blip2,
    "chameleon": run_chameleon,
    "deepseek_vl_v2": run_deepseek_vl2,
    "florence2": run_florence2,
    "fuyu": run_fuyu,
    "gemma3": run_gemma3,
    "glm4v": run_glm4v,
    "h2ovl_chat": run_h2ovl,
    "idefics3": run_idefics3,
    "internvl_chat": run_internvl,
    "kimi_vl": run_kimi_vl,
    "llava": run_llava,
    "llava-next": run_llava_next,
    "llava-next-video": run_llava_next_video,
    "llava-onevision": run_llava_onevision,
    "mantis": run_mantis,
    "minicpmo": run_minicpmo,
    "minicpmv": run_minicpmv,
    "mistral3": run_mistral3,
    "mllama": run_mllama,
    "llama4": run_llama4,
    "molmo": run_molmo,
    "NVLM_D": run_nvlm_d,
    "ovis2": run_ovis2,
    "paligemma": run_paligemma,
    "paligemma2": run_paligemma2,
    "phi3_v": run_phi3v,
    "phi4_mm": run_phi4mm,
    "pixtral_hf": run_pixtral_hf,
    "qwen_vl": run_qwen_vl,
    "qwen2_vl": run_qwen2_vl,
    "qwen2_5_vl": run_qwen2_5_vl,
    "qwen2_5_omni": run_qwen2_5_omni,
    "skywork_chat": run_skyworkr1v,
    "smolvlm": run_smolvlm,
}


1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
def get_multi_modal_input(args):
    """Load the demo asset for ``args.modality`` along with its questions.

    Args:
        args: Parsed command-line arguments. ``args.modality`` selects the
            asset; ``args.num_frames`` is read only for the video modality.

    Returns:
        A dict of the form::

            {
                "data": image or video,
                "questions": list of question strings,
            }

    Raises:
        ValueError: If ``args.modality`` is neither "image" nor "video".
    """
    if args.modality == "image":
        # Input image and question
        image = ImageAsset("cherry_blossom") \
            .pil_image.convert("RGB")
        img_questions = [
            "What is the content of this image?",
            "Describe the content of this image in detail.",
            "What's in the image?",
            "Where is this image taken?",
        ]

        return {
            "data": image,
            "questions": img_questions,
        }

    if args.modality == "video":
        # Input video and question
        video = VideoAsset(name="sample_demo_1",
                           num_frames=args.num_frames).np_ndarrays
        vid_questions = ["Why is this video funny?"]

        return {
            "data": video,
            "questions": vid_questions,
        }

    msg = f"Modality {args.modality} is not supported."
    raise ValueError(msg)


1125
1126
def apply_image_repeat(image_repeat_prob, num_prompts, data,
                       prompts: list[str], modality):
    """Repeats images with provided probability of "image_repeat_prob".

    Used to simulate hit/miss for the MM preprocessor cache.

    Args:
        image_repeat_prob: Probability in [0, 1] that a request reuses the
            image of the previous request unchanged.
        num_prompts: Number of request inputs to build.
        data: Initial image. It must provide ``copy()`` and
            ``putpixel((x, y), rgb)`` (presumably a PIL RGB image; confirm
            against the caller). The object passed in is never modified.
        prompts: Prompt strings, cycled through when ``num_prompts`` exceeds
            their count.
        modality: Key under which the image is stored in
            ``multi_modal_data``.

    Returns:
        A list of ``num_prompts`` dicts with "prompt" and "multi_modal_data"
        entries.

    Raises:
        ValueError: If ``image_repeat_prob`` lies outside [0, 1].
    """
    if not 0 <= image_repeat_prob <= 1.0:
        raise ValueError(
            f"image_repeat_prob must be within [0, 1], got {image_repeat_prob}"
        )

    no_yes = [0, 1]
    probs = [1.0 - image_repeat_prob, image_repeat_prob]

    inputs = []
    cur_image = data
    for i in range(num_prompts):
        res = random.choices(no_yes, probs)[0]
        if res == 0:
            # No repeat => Modify one pixel. The request index is spread over
            # the three channels, each masked to 8 bits so that no channel
            # value leaves the 0-255 range for large indices.
            cur_image = cur_image.copy()
            new_val = ((i >> 16) & 0xFF, (i >> 8) & 0xFF, i & 0xFF)
            cur_image.putpixel((0, 0), new_val)

        inputs.append({
            "prompt": prompts[i % len(prompts)],
            "multi_modal_data": {
                modality: cur_image
            }
        })

    return inputs


1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
@contextmanager
def time_counter(enable: bool):
    """Optionally time the enclosed block and print the elapsed seconds.

    Args:
        enable: When false, the context manager does nothing. When true, the
            elapsed time of the ``with`` body is printed between two dashed
            separator lines once the body finishes without raising.
    """
    if not enable:
        yield
        return

    import time

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    yield
    elapsed_time = time.perf_counter() - start_time

    separator = "-" * 50
    print(separator)
    print(f"-- generate time = {elapsed_time}")
    print(separator)


1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
def parse_args():
    """Define the command-line interface of the demo and parse ``sys.argv``.

    Returns:
        The namespace of parsed arguments.
    """
    parser = FlexibleArgumentParser(
        description=("Demo on using vLLM for offline inference "
                     "with vision language models for text generation"))

    # Model / workload selection.
    parser.add_argument("--model-type",
                        "-m",
                        type=str,
                        default="llava",
                        choices=model_example_map.keys(),
                        help='Huggingface "model_type".')
    parser.add_argument("--num-prompts",
                        type=int,
                        default=4,
                        help="Number of prompts to run.")
    parser.add_argument("--modality",
                        type=str,
                        default="image",
                        choices=["image", "video"],
                        help="Modality of the input.")
    parser.add_argument("--num-frames",
                        type=int,
                        default=16,
                        help="Number of frames to extract from the video.")
    parser.add_argument("--seed",
                        type=int,
                        default=None,
                        help="Set the seed when initializing `vllm.LLM`.")

    # Multi-modal preprocessor cache experiments.
    parser.add_argument(
        "--image-repeat-prob",
        type=float,
        default=None,
        help=("Simulates the hit-ratio for multi-modal preprocessor "
              "cache (if enabled)"))
    parser.add_argument(
        "--disable-mm-preprocessor-cache",
        action="store_true",
        help="If True, disables caching of multi-modal preprocessor/mapper.")

    # Reporting and prompt variation.
    parser.add_argument(
        "--time-generate",
        action="store_true",
        help="If True, then print the total generate() call time")
    parser.add_argument(
        "--use-different-prompt-per-request",
        action="store_true",
        help=("If True, then use different prompt "
              "(with the same multi-modal data) for each request."))

    return parser.parse_args()


1222
1223
1224
1225
1226
def main(args):
    """Run the selected vision-language example end to end.

    Loads the demo asset for ``args.modality``, builds the model-specific
    request data, creates the ``LLM``, generates completions for
    ``args.num_prompts`` requests and prints each generated text.

    Args:
        args: Parsed command-line arguments as returned by ``parse_args``.

    Raises:
        ValueError: If ``args.num_prompts`` is not positive or
            ``args.model_type`` has no entry in ``model_example_map``.
    """
    # Validate cheap inputs first so that a bad value fails before the asset
    # is loaded and the model is initialized.
    if args.num_prompts <= 0:
        raise ValueError("--num-prompts must be a positive integer.")

    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    modality = args.modality
    mm_input = get_multi_modal_input(args)
    data = mm_input["data"]
    questions = mm_input["questions"]

    req_data = model_example_map[model](questions, modality)

    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {})

    engine_args = asdict(req_data.engine_args) | {
        "seed": args.seed,
        "disable_mm_preprocessor_cache": args.disable_mm_preprocessor_cache,
    }
    llm = LLM(**engine_args)

    # Don't want to check the flag multiple times, so just hijack `prompts`.
    prompts = req_data.prompts if args.use_different_prompt_per_request else [
        req_data.prompts[0]
    ]

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=req_data.stop_token_ids)

    if args.num_prompts == 1:
        # Single inference
        inputs = {
            "prompt": prompts[0],
            "multi_modal_data": {
                modality: data
            },
        }
    else:
        # Batch inference
        if args.image_repeat_prob is not None:
            # Repeat images with specified probability of "image_repeat_prob"
            inputs = apply_image_repeat(args.image_repeat_prob,
                                        args.num_prompts, data, prompts,
                                        modality)
        else:
            # Use the same image for all prompts
            inputs = [{
                "prompt": prompts[i % len(prompts)],
                "multi_modal_data": {
                    modality: data
                },
            } for i in range(args.num_prompts)]

    # Add LoRA request if applicable
    lora_request = (req_data.lora_requests *
                    args.num_prompts if req_data.lora_requests else None)

    with time_counter(args.time_generate):
        outputs = llm.generate(
            inputs,
            sampling_params=sampling_params,
            lora_request=lora_request,
        )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        print("-" * 50)
1297
1298
1299


# Script entry point: parse the command-line flags, then run the demo.
if __name__ == "__main__":
    args = parse_args()
    main(args)