vision_language.py 37.5 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for text generation.

For most models, the prompt format should follow the corresponding examples
in the HuggingFace model repository.
"""
9
import os
10
import random
11
from contextlib import contextmanager
12
13
from dataclasses import asdict
from typing import NamedTuple, Optional
14

15
from huggingface_hub import snapshot_download
16
17
from transformers import AutoTokenizer

18
from vllm import LLM, EngineArgs, SamplingParams
19
from vllm.assets.image import ImageAsset
20
from vllm.assets.video import VideoAsset
21
from vllm.lora.request import LoRARequest
22
23
from vllm.utils import FlexibleArgumentParser

24
25
26
27
28
29
30
31

class ModelRequestData(NamedTuple):
    """Bundle returned by each ``run_*`` helper describing one model setup."""
    # Engine construction arguments for the selected model.
    engine_args: EngineArgs
    # Fully formatted prompt strings (the helpers build one per question).
    prompts: list[str]
    # Model-specific stop token IDs; None when the helper provides none.
    stop_token_ids: Optional[list[int]] = None
    # LoRA adapters to apply; None when the helper provides none.
    lora_requests: Optional[list[LoRARequest]] = None


32
33
34
35
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.

36

37
# Aria
38
def run_aria(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for the Aria model.

    Args:
        questions: Questions to wrap in Aria's chat prompt format.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments, one prompt per question and
        the stop token IDs used for this model.
    """
    assert modality == "image"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    aria_args = EngineArgs(
        model="rhymes-ai/Aria",
        max_model_len=4096,
        max_num_seqs=2,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": 1},
    )

    user_turn = "<|im_start|>user\n<fim_prefix><|img|><fim_suffix>"
    assistant_turn = "<|im_end|>\n<|im_start|>assistant\n"
    formatted = [
        f"{user_turn}{question}{assistant_turn}" for question in questions
    ]

    return ModelRequestData(
        engine_args=aria_args,
        prompts=formatted,
        stop_token_ids=[93532, 93653, 944, 93421, 1019, 93653, 93519],
    )
62
63


Jennifer Zhao's avatar
Jennifer Zhao committed
64
65
66
67
68
69
70
71
72
73
# Aya Vision
def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Aya Vision.

    Args:
        questions: Questions to wrap in the Aya Vision turn-token format.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments and one prompt per question.
    """
    assert modality == "image"

    user_prefix = "<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>"
    chatbot_suffix = (
        "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>")
    formatted = [
        f"{user_prefix}{question}{chatbot_suffix}" for question in questions
    ]

    aya_args = EngineArgs(
        model="CohereForAI/aya-vision-8b",
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"crop_to_patches": True},
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(engine_args=aya_args, prompts=formatted)


86
# BLIP-2
87
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for BLIP-2.

    Args:
        questions: Questions to embed in the ``Question: ... Answer:`` form.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments and one prompt per question.
    """
    assert modality == "image"

    blip2_args = EngineArgs(
        model="Salesforce/blip2-opt-6.7b",
        limit_mm_per_prompt={"image": 1},
    )

    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
    formatted = []
    for question in questions:
        formatted.append(f"Question: {question} Answer:")

    return ModelRequestData(engine_args=blip2_args, prompts=formatted)
102
103
104


# Chameleon
105
def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Chameleon.

    Args:
        questions: Questions; each is followed by the ``<image>`` placeholder.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments and one prompt per question.
    """
    assert modality == "image"

    chameleon_args = EngineArgs(
        model="facebook/chameleon-7b",
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
    )

    formatted = []
    for question in questions:
        formatted.append(f"{question}<image>")

    return ModelRequestData(engine_args=chameleon_args, prompts=formatted)
120
121


122
# Deepseek-VL2
123
def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Deepseek-VL2 (tiny variant).

    Args:
        questions: Questions to wrap in the ``<|User|>`` / ``<|Assistant|>``
            prompt format.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments and one prompt per question.
    """
    assert modality == "image"

    formatted = []
    for question in questions:
        formatted.append(f"<|User|>: <image>\n{question}\n\n<|Assistant|>:")

    deepseek_args = EngineArgs(
        model="deepseek-ai/deepseek-vl2-tiny",
        max_model_len=4096,
        max_num_seqs=2,
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(engine_args=deepseek_args, prompts=formatted)
145
146


147
# Florence2
148
def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Florence-2.

    The question text is not used: every prompt is the fixed
    ``<MORE_DETAILED_CAPTION>`` task token, one per question.

    Args:
        questions: Questions; only their count affects the result.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments and the task-token prompts.
    """
    assert modality == "image"

    task_prompts = ["<MORE_DETAILED_CAPTION>" for _unused in questions]

    florence_args = EngineArgs(
        model="microsoft/Florence-2-large",
        tokenizer="Isotr0py/Florence-2-tokenizer",
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(engine_args=florence_args, prompts=task_prompts)
167
168


169
# Fuyu
170
def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Fuyu.

    Args:
        questions: Questions; each prompt is the question plus a newline.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments and one prompt per question.
    """
    assert modality == "image"

    fuyu_args = EngineArgs(
        model="adept/fuyu-8b",
        max_model_len=2048,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
    )

    formatted = []
    for question in questions:
        formatted.append(f"{question}\n")

    return ModelRequestData(engine_args=fuyu_args, prompts=formatted)
185
186


187
# Gemma 3
188
def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Gemma 3.

    Args:
        questions: Questions to wrap in Gemma's turn-based prompt format.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments and one prompt per question.
    """
    assert modality == "image"

    turn_open = "<bos><start_of_turn>user\n<start_of_image>"
    turn_close = "<end_of_turn>\n<start_of_turn>model\n"
    formatted = [f"{turn_open}{question}{turn_close}" for question in questions]

    gemma_args = EngineArgs(
        model="google/gemma-3-4b-it",
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"do_pan_and_scan": True},
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(engine_args=gemma_args, prompts=formatted)
208
209


210
# GLM-4v
211
def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for GLM-4v.

    Args:
        questions: Questions to wrap in the GLM-4v prompt format.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments, one prompt per question and
        the stop token IDs used for this model.
    """
    assert modality == "image"
    model_name = "THUDM/glm-4v-9b"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        trust_remote_code=True,
        enforce_eager=True,
        hf_overrides={"architectures": ["GLM4VForCausalLM"]},
        limit_mm_per_prompt={"image": 1},
    )

    # Use adjacent string literals rather than a backslash continuation
    # inside the literal: the latter silently embeds the next source line's
    # indentation between the image tokens and the question.
    prompts = [("<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>"
                f"{question}<|assistant|>") for question in questions]

    stop_token_ids = [151329, 151336, 151338]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
237
238
239


# H2OVL-Mississippi
240
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for H2OVL-Mississippi.

    Prompts are produced by the model tokenizer's chat template from
    single-turn user messages of the form ``<image>\\n{question}``.

    Args:
        questions: Questions to turn into chat-formatted prompts.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments, the templated prompts and
        the tokenizer's EOS token ID as the only stop token.
    """
    assert modality == "image"

    repo_id = "h2oai/h2ovl-mississippi-800m"

    h2ovl_args = EngineArgs(
        model=repo_id,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={"image": 1},
    )

    hf_tokenizer = AutoTokenizer.from_pretrained(repo_id,
                                                 trust_remote_code=True)

    conversations = []
    for question in questions:
        user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
        conversations.append([user_turn])

    templated = hf_tokenizer.apply_chat_template(conversations,
                                                 tokenize=False,
                                                 add_generation_prompt=True)

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    return ModelRequestData(
        engine_args=h2ovl_args,
        prompts=templated,
        stop_token_ids=[hf_tokenizer.eos_token_id],
    )
271
272
273


# Idefics3-8B-Llama3
274
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Idefics3-8B-Llama3.

    Args:
        questions: Questions to wrap in the ``User:`` / ``Assistant:`` format.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments and one prompt per question.
    """
    assert modality == "image"

    # if you are running out of memory, you can reduce the "longest_edge".
    # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
    processor_kwargs = {"size": {"longest_edge": 3 * 364}}

    idefics_args = EngineArgs(
        model="HuggingFaceM4/Idefics3-8B-Llama3",
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        mm_processor_kwargs=processor_kwargs,
        limit_mm_per_prompt={"image": 1},
    )

    formatted = []
    for question in questions:
        formatted.append(
            "<|begin_of_text|>User:<image>"
            f"{question}<end_of_utterance>\nAssistant:")

    return ModelRequestData(engine_args=idefics_args, prompts=formatted)
300
301


302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
# SmolVLM2-2.2B-Instruct
def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for SmolVLM2-2.2B-Instruct.

    Args:
        questions: Questions to wrap in the ``User:`` / ``Assistant:`` format.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments and one prompt per question.
    """
    assert modality == "image"

    processor_kwargs = {"max_image_size": {"longest_edge": 384}}

    smolvlm_args = EngineArgs(
        model="HuggingFaceTB/SmolVLM2-2.2B-Instruct",
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        mm_processor_kwargs=processor_kwargs,
        limit_mm_per_prompt={"image": 1},
    )

    formatted = []
    for question in questions:
        formatted.append(
            f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:"
        )

    return ModelRequestData(engine_args=smolvlm_args, prompts=formatted)


330
# InternVL
331
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for InternVL2.

    Prompts are produced by the model tokenizer's chat template from
    single-turn user messages of the form ``<image>\\n{question}``.

    Args:
        questions: Questions to turn into chat-formatted prompts.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments, the templated prompts and
        the IDs of the model's stop tokens.
    """
    assert modality == "image"

    repo_id = "OpenGVLab/InternVL2-2B"

    internvl_args = EngineArgs(
        model=repo_id,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": 1},
    )

    hf_tokenizer = AutoTokenizer.from_pretrained(repo_id,
                                                 trust_remote_code=True)

    conversations = []
    for question in questions:
        user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
        conversations.append([user_turn])

    templated = hf_tokenizer.apply_chat_template(conversations,
                                                 tokenize=False,
                                                 add_generation_prompt=True)

    # Stop tokens for InternVL
    # models variants may have different stop tokens
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_ids = []
    for token in ("<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"):
        stop_ids.append(hf_tokenizer.convert_tokens_to_ids(token))

    return ModelRequestData(
        engine_args=internvl_args,
        prompts=templated,
        stop_token_ids=stop_ids,
    )
365
366


367
368
369
370
371
372
373
374
375
376
377
378
379
# Kimi-VL
def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Kimi-VL.

    Args:
        questions: Questions to wrap in the Kimi-VL chat prompt format.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments and one prompt per question.
    """
    assert modality == "image"

    kimi_args = EngineArgs(
        model="moonshotai/Kimi-VL-A3B-Instruct",
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": 1},
    )

    image_turn = ("<|im_user|>user<|im_middle|><|media_start|>image"
                  "<|media_content|><|media_pad|><|media_end|>")
    assistant_turn = "<|im_end|><|im_assistant|>assistant<|im_middle|>"
    formatted = [
        f"{image_turn}{question}{assistant_turn}" for question in questions
    ]

    return ModelRequestData(engine_args=kimi_args, prompts=formatted)


390
# LLaVA-1.5
391
def run_llava(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for LLaVA-1.5.

    Args:
        questions: Questions to wrap in the ``USER:`` / ``ASSISTANT:`` format.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments and one prompt per question.
    """
    assert modality == "image"

    llava_args = EngineArgs(
        model="llava-hf/llava-1.5-7b-hf",
        max_model_len=4096,
        limit_mm_per_prompt={"image": 1},
    )

    formatted = []
    for question in questions:
        formatted.append(f"USER: <image>\n{question}\nASSISTANT:")

    return ModelRequestData(engine_args=llava_args, prompts=formatted)
408
409
410


# LLaVA-1.6/LLaVA-NeXT
411
def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for LLaVA-1.6 / LLaVA-NeXT.

    Args:
        questions: Questions to wrap in the ``[INST] ... [/INST]`` format.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments and one prompt per question.
    """
    assert modality == "image"

    llava_next_args = EngineArgs(
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        max_model_len=8192,
        limit_mm_per_prompt={"image": 1},
    )

    formatted = []
    for question in questions:
        formatted.append(f"[INST] <image>\n{question} [/INST]")

    return ModelRequestData(engine_args=llava_next_args, prompts=formatted)
425
426
427
428


# LLaVA-NeXT-Video
# Currently only supports video input
429
430
def run_llava_next_video(questions: list[str],
                         modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for LLaVA-NeXT-Video.

    Args:
        questions: Questions to wrap in the ``USER:`` / ``ASSISTANT:`` format
            with a ``<video>`` placeholder.
        modality: Input modality; must be ``"video"``.

    Returns:
        Request data with the engine arguments and one prompt per question.
    """
    assert modality == "video"

    prompts = [
        f"USER: <video>\n{question} ASSISTANT:" for question in questions
    ]
    engine_args = EngineArgs(
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        max_model_len=8192,
        max_num_seqs=2,
        # Key the limit on the modality actually used; this helper only
        # accepts video input, so limiting "image" did not match its prompts.
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
447
448


449
# LLaVA-OneVision
450
451
def run_llava_onevision(questions: list[str],
                        modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for LLaVA-OneVision.

    Args:
        questions: Questions to wrap in the chat prompt format.
        modality: Input modality; ``"image"`` or ``"video"``. It selects the
            ``<image>`` or ``<video>`` placeholder in the prompt.

    Returns:
        Request data with the engine arguments and one prompt per question.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    placeholders = {"image": "<image>", "video": "<video>"}
    if modality not in placeholders:
        # Previously this fell through and failed later with an unbound
        # ``prompts`` variable; fail early with a descriptive error instead.
        raise ValueError(
            f"Unsupported modality for LLaVA-OneVision: {modality!r}")
    placeholder = placeholders[modality]

    # Adjacent literals avoid the backslash-continuation pitfall that embedded
    # the next source line's indentation into the prompt text.
    prompts = [(f"<|im_start|>user {placeholder}\n{question}<|im_end|>"
                "<|im_start|>assistant\n") for question in questions]

    engine_args = EngineArgs(
        model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
        max_model_len=16384,
        # Limit the modality that is actually present in the prompts.
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
475
476


477
# Mantis
478
def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Mantis.

    Each prompt places ``{question}\\n<image>`` inside a Llama-3 style chat
    template.

    Args:
        questions: Questions to insert into the template.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments, one prompt per question and
        a single stop token ID.
    """
    assert modality == "image"

    mantis_args = EngineArgs(
        model="TIGER-Lab/Mantis-8B-siglip-llama3",
        max_model_len=4096,
        hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
        limit_mm_per_prompt={"image": 1},
    )

    chat_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'  # noqa: E501
    formatted = []
    for question in questions:
        user_content = f"{question}\n<image>"
        formatted.append(chat_template.format(user_content))

    return ModelRequestData(
        engine_args=mantis_args,
        prompts=formatted,
        stop_token_ids=[128009],
    )
500
501
502


# MiniCPM-V
503
def run_minicpmv_base(questions: list[str], modality: str,
                      model_name) -> ModelRequestData:
    """Prepare engine arguments and prompts for a MiniCPM-V family model.

    Args:
        questions: Questions to turn into chat-formatted prompts.
        modality: Input modality; ``"image"`` or ``"video"``.
        model_name: Model repository ID, used for both the engine and the
            tokenizer that renders the chat template.

    Returns:
        Request data with the engine arguments, the templated prompts and
        the stop token IDs for the 2.6 / o2.6 variants.
    """
    assert modality in ["image", "video"]
    # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa

    # Known model names and the modalities they support:
    #   2.0  (image):               "HwwwH/MiniCPM-V-2"
    #        The official repo doesn't work yet, so we need to use a fork
    #        for now. For more details, please see:
    #        https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa
    #   2.5  (image):               "openbmb/MiniCPM-Llama3-V-2_5"
    #   2.6  (image, video):        "openbmb/MiniCPM-V-2_6"
    #   o2.6 (image, video, audio): "openbmb/MiniCPM-o-2_6"
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        # Video input is accepted above, so the limit must follow the
        # modality in use rather than being fixed to "image".
        limit_mm_per_prompt={modality: 1},
    )

    # NOTE The stop_token_ids are different for various versions of MiniCPM-V
    # 2.0
    # stop_token_ids = [tokenizer.eos_id]

    # 2.5
    # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]

    # 2.6 / o2.6
    stop_tokens = ['<|im_end|>', '<|endoftext|>']
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    modality_placeholder = {
        "image": "(<image>./</image>)",
        "video": "(<video>./</video>)",
    }

    prompts = [
        tokenizer.apply_chat_template(
            [{
                'role': 'user',
                'content': f"{modality_placeholder[modality]}\n{question}"
            }],
            tokenize=False,
            add_generation_prompt=True) for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
565
566


567
def run_minicpmo(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare request data for MiniCPM-o 2.6 via ``run_minicpmv_base``."""
    return run_minicpmv_base(questions,
                             modality,
                             model_name="openbmb/MiniCPM-o-2_6")
569
570


571
def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare request data for MiniCPM-V 2.6 via ``run_minicpmv_base``."""
    return run_minicpmv_base(questions,
                             modality,
                             model_name="openbmb/MiniCPM-V-2_6")
573
574


575
576
577
578
579
580
581
582
583
584
585
586
# Mistral-3 HF-format
def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Mistral-3 (HF format).

    Args:
        questions: Questions to wrap in the ``[INST] ... [/INST]`` format
            with an ``[IMG]`` placeholder.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments and one prompt per question.
    """
    assert modality == "image"

    formatted = []
    for question in questions:
        formatted.append(f"<s>[INST]{question}\n[IMG][/INST]")

    # NOTE: Need L40 (or equivalent) to avoid OOM
    mistral_args = EngineArgs(
        model="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(engine_args=mistral_args, prompts=formatted)


598
# Llama 3.2
599
def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Llama 3.2 Vision.

    Prompts are produced by the model tokenizer's chat template from
    single-turn user messages containing an image part and a text part.

    Args:
        questions: Questions used as the text part of each message.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments and the templated prompts.
    """
    assert modality == "image"

    repo_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

    # Note: The default setting of max_num_seqs (256) and
    # max_model_len (131072) for this model may cause OOM.
    # You may lower either to run this example on lower-end GPUs.

    # The configuration below has been confirmed to launch on a single L40 GPU.
    mllama_args = EngineArgs(
        model=repo_id,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
    )

    hf_tokenizer = AutoTokenizer.from_pretrained(repo_id)

    conversations = []
    for question in questions:
        parts = [{"type": "image"}, {"type": "text", "text": question}]
        conversations.append([{"role": "user", "content": parts}])

    templated = hf_tokenizer.apply_chat_template(conversations,
                                                 add_generation_prompt=True,
                                                 tokenize=False)

    return ModelRequestData(engine_args=mllama_args, prompts=templated)
635
636


637
def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Llama 4 Scout.

    Prompts are produced by the model tokenizer's chat template from
    single-turn user messages containing an image part and a text part.

    Args:
        questions: Questions used as the text part of each message.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments and the templated prompts;
        no stop token IDs are set.
    """
    assert modality == "image"

    repo_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

    llama4_args = EngineArgs(
        model=repo_id,
        max_model_len=8192,
        max_num_seqs=4,
        tensor_parallel_size=8,
        gpu_memory_utilization=0.4,
        limit_mm_per_prompt={"image": 1},
    )

    hf_tokenizer = AutoTokenizer.from_pretrained(repo_id)

    conversations = []
    for question in questions:
        parts = [{"type": "image"}, {"type": "text", "text": f"{question}"}]
        conversations.append([{"role": "user", "content": parts}])

    templated = hf_tokenizer.apply_chat_template(conversations,
                                                 add_generation_prompt=True,
                                                 tokenize=False)

    return ModelRequestData(
        engine_args=llama4_args,
        prompts=templated,
        stop_token_ids=None,
    )


673
# Molmo
674
def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Molmo.

    Args:
        questions: Questions to wrap in the chat prompt format.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments and one prompt per question.
    """
    assert modality == "image"

    model_name = "allenai/Molmo-7B-D-0924"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": 1},
    )

    # Adjacent literals avoid the backslash-continuation pitfall that embedded
    # the next source line's indentation into the prompt text.
    prompts = [(f"<|im_start|>user <image>\n{question}<|im_end|>"
                "<|im_start|>assistant\n") for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
695
696


697
# NVLM-D
698
def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for NVLM-D.

    Prompts are produced by the model tokenizer's chat template from
    single-turn user messages of the form ``<image>\\n{question}``.

    Args:
        questions: Questions to turn into chat-formatted prompts.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments and the templated prompts.
    """
    assert modality == "image"

    repo_id = "nvidia/NVLM-D-72B"

    # Adjust this as necessary to fit in GPU
    nvlm_args = EngineArgs(
        model=repo_id,
        trust_remote_code=True,
        max_model_len=4096,
        tensor_parallel_size=4,
        limit_mm_per_prompt={"image": 1},
    )

    hf_tokenizer = AutoTokenizer.from_pretrained(repo_id,
                                                 trust_remote_code=True)

    conversations = []
    for question in questions:
        user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
        conversations.append([user_turn])

    templated = hf_tokenizer.apply_chat_template(conversations,
                                                 tokenize=False,
                                                 add_generation_prompt=True)

    return ModelRequestData(engine_args=nvlm_args, prompts=templated)
726
727


728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
# Ovis2
def run_ovis2(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Ovis2.

    Args:
        questions: Questions to wrap in the system/user/assistant chat format.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments and one prompt per question.
    """
    assert modality == "image"

    ovis_args = EngineArgs(
        model="AIDC-AI/Ovis2-1B",
        tokenizer="Isotr0py/Ovis2-tokenizer",
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        hf_overrides={"architectures": ["Ovis2ForConditionalGeneration"]},
        limit_mm_per_prompt={"image": 1},
    )

    preamble = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
                "<|im_start|>user\n<image>\n")
    closing = "<|im_end|>\n<|im_start|>assistant\n"
    formatted = [f"{preamble}{question}{closing}" for question in questions]

    return ModelRequestData(engine_args=ovis_args, prompts=formatted)


758
# PaliGemma
759
def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for PaliGemma.

    The question text is not used: every prompt is the fixed string
    ``"caption en"``, one per question.

    Args:
        questions: Questions; only their count affects the result.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments and the fixed prompts.
    """
    assert modality == "image"

    paligemma_args = EngineArgs(
        model="google/paligemma-3b-mix-224",
        limit_mm_per_prompt={"image": 1},
    )

    # PaliGemma has special prompt format for VQA
    fixed_prompts = ["caption en" for _unused in questions]

    return ModelRequestData(engine_args=paligemma_args, prompts=fixed_prompts)
773
774


775
# PaliGemma 2
776
def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for PaliGemma 2.

    The question text is not used: every prompt is the fixed string
    ``"caption en"``, one per question.

    Args:
        questions: Questions; only their count affects the result.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments and the fixed prompts.
    """
    assert modality == "image"

    paligemma2_args = EngineArgs(
        model="google/paligemma2-3b-ft-docci-448",
        limit_mm_per_prompt={"image": 1},
    )

    # PaliGemma 2 has special prompt format for VQA
    fixed_prompts = ["caption en" for _unused in questions]

    return ModelRequestData(engine_args=paligemma2_args,
                            prompts=fixed_prompts)
790
791


792
# Phi-3-Vision
793
def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Phi-3.5-vision-instruct.

    Args:
        questions: Questions to wrap in the ``<|user|>`` / ``<|assistant|>``
            format with an ``<|image_1|>`` placeholder.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments and one prompt per question.
    """
    assert modality == "image"

    # num_crops is an override kwarg to the multimodal image processor;
    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
    # to use 16 for single frame scenarios, and 4 for multi-frame.
    #
    # Generally speaking, a larger value for num_crops results in more
    # tokens per image instance, because it may scale the image more in
    # the image preprocessing. Some references in the model docs and the
    # formula for image tokens after the preprocessing
    # transform can be found below.
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    # Note - mm_processor_kwargs can also be passed to generate/chat calls
    processor_kwargs = {"num_crops": 16}

    phi3v_args = EngineArgs(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        mm_processor_kwargs=processor_kwargs,
        limit_mm_per_prompt={"image": 1},
    )

    formatted = []
    for question in questions:
        formatted.append(
            f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n")

    return ModelRequestData(engine_args=phi3v_args, prompts=formatted)
827
828


829
# Phi-4-multimodal-instruct
830
def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
    """
    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
    show how to process image inputs.

    Args:
        questions: Questions to wrap in the ``<|user|>`` / ``<|assistant|>``
            format with an ``<|image_1|>`` placeholder.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments, one prompt per question and
        a LoRA request pointing at the ``vision-lora`` directory of the
        downloaded model snapshot.
    """
    assert modality == "image"

    snapshot_dir = snapshot_download("microsoft/Phi-4-multimodal-instruct")
    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    lora_dir = os.path.join(snapshot_dir, "vision-lora")

    formatted = []
    for question in questions:
        formatted.append(f"<|user|><|image_1|>{question}<|end|><|assistant|>")

    phi4_args = EngineArgs(
        model=snapshot_dir,
        trust_remote_code=True,
        max_model_len=5120,
        max_num_seqs=2,
        max_num_batched_tokens=12800,
        enable_lora=True,
        max_lora_rank=320,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"dynamic_hd": 16},
        limit_mm_per_prompt={"image": 1},
    )

    vision_adapter = LoRARequest("vision", 1, lora_dir)
    return ModelRequestData(
        engine_args=phi4_args,
        prompts=formatted,
        lora_requests=[vision_adapter],
    )
862
863


864
# Pixtral HF-format
865
def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Pixtral (HF format).

    Args:
        questions: Questions to wrap in the ``[INST] ... [/INST]`` format
            with an ``[IMG]`` placeholder.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments and one prompt per question.
    """
    assert modality == "image"

    formatted = []
    for question in questions:
        formatted.append(f"<s>[INST]{question}\n[IMG][/INST]")

    # NOTE: Need L40 (or equivalent) to avoid OOM
    pixtral_args = EngineArgs(
        model="mistral-community/pixtral-12b",
        max_model_len=6144,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(engine_args=pixtral_args, prompts=formatted)
884
885


886
# Qwen
887
def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Qwen-VL.

    Args:
        questions: Questions; each is followed by the
            ``Picture 1: <img></img>`` placeholder line.
        modality: Input modality; must be ``"image"``.

    Returns:
        Request data with the engine arguments and one prompt per question.
    """
    assert modality == "image"

    formatted = []
    for question in questions:
        formatted.append(f"{question}Picture 1: <img></img>\n")

    qwen_args = EngineArgs(
        model="Qwen/Qwen-VL",
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(engine_args=qwen_args, prompts=formatted)
905
906


907
# Qwen2-VL
908
def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Qwen2-VL.

    Args:
        questions: Questions to wrap in the system/user/assistant chat format.
        modality: Input modality; ``"image"`` or ``"video"``. It selects the
            pad token placed between the vision start/end tokens.

    Returns:
        Request data with the engine arguments and one prompt per question.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    placeholders = {"image": "<|image_pad|>", "video": "<|video_pad|>"}
    if modality not in placeholders:
        # Previously this fell through and failed later with an unbound
        # ``placeholder`` variable; fail early with a descriptive error.
        raise ValueError(f"Unsupported modality for Qwen2-VL: {modality!r}")
    placeholder = placeholders[modality]

    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
        },
        # Limit the modality that is actually present in the prompts.
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
         f"{question}<|im_end|>\n"
         "<|im_start|>assistant\n") for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
940
941


Roger Wang's avatar
Roger Wang committed
942
# Qwen2.5-VL
943
def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Qwen2.5-VL.

    Args:
        questions: Questions to wrap in the system/user/assistant chat format.
        modality: Input modality; ``"image"`` or ``"video"``. It selects the
            pad token placed between the vision start/end tokens.

    Returns:
        Request data with the engine arguments and one prompt per question.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    placeholders = {"image": "<|image_pad|>", "video": "<|video_pad|>"}
    if modality not in placeholders:
        # Previously this fell through and failed later with an unbound
        # ``placeholder`` variable; fail early with a descriptive error.
        raise ValueError(f"Unsupported modality for Qwen2.5-VL: {modality!r}")
    placeholder = placeholders[modality]

    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
        # Limit the modality that is actually present in the prompts.
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
         f"{question}<|im_end|>\n"
         "<|im_start|>assistant\n") for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
Roger Wang's avatar
Roger Wang committed
975
976


977
978
979
980
981
982
983
984
985
986
987
988
989
# Qwen2.5-Omni
def run_qwen2_5_omni(questions: list[str],
                     modality: str) -> ModelRequestData:
    """Build the engine arguments and prompts for Qwen2.5-Omni.

    Args:
        questions: Questions to ask; one prompt is produced per question.
        modality: Either ``"image"`` or ``"video"``.

    Returns:
        A ``ModelRequestData`` holding the engine arguments and the prompts.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    # Validate the modality up front so an unsupported value fails with a
    # clear message instead of an unbound `placeholder` further down.
    if modality == "image":
        placeholder = "<|IMAGE|>"
    elif modality == "video":
        placeholder = "<|VIDEO|>"
    else:
        raise ValueError(f"Modality {modality} is not supported.")

    model_name = "Qwen/Qwen2.5-Omni-7B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": [1],
        },
        # Allow one item of the requested modality. main() sets every
        # modality that is not listed here to 0, so hard-coding "image"
        # would leave no budget for video inputs.
        limit_mm_per_prompt={modality: 1},
    )

    default_system = (
        "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
        "Group, capable of perceiving auditory and visual inputs, as well as "
        "generating text and speech.")

    prompts = [(f"<|im_start|>system\n{default_system}<|im_end|>\n"
                f"<|im_start|>user\n<|vision_bos|>{placeholder}<|vision_eos|>"
                f"{question}<|im_end|>\n"
                "<|im_start|>assistant\n") for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# SkyworkR1V
def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments, prompts and stop tokens for Skywork-R1V.

    Args:
        questions: Questions to ask; one prompt is produced per question.
        modality: Must be ``"image"``.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the prompts
        rendered through the tokenizer's chat template, and the stop token
        ids.
    """
    assert modality == "image"

    model_name = "Skywork/Skywork-R1V-38B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    # One single-turn conversation per question.
    messages = [[{
        'role': 'user',
        'content': f"<image>\n{question}"
    }] for question in questions]
    prompts = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
                                            add_generation_prompt=True)

    # Stop tokens for SkyworkR1V
    # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py
    stop_tokens = ["<|end▁of▁sentence|>", "<|endoftext|>"]
    candidate_ids = (tokenizer.convert_tokens_to_ids(token)
                     for token in stop_tokens)
    # Drop tokens the tokenizer could not resolve so that only real ids are
    # handed on as stop tokens.
    stop_token_ids = [
        token_id for token_id in candidate_ids if token_id is not None
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


1048
# Maps each `--model-type` choice to the function that builds the request
# data (engine arguments and prompts) for that model.
model_example_map = {
    "aria": run_aria,
    "aya_vision": run_aya_vision,
    "blip-2": run_blip2,
    "chameleon": run_chameleon,
    "deepseek_vl_v2": run_deepseek_vl2,
    "florence2": run_florence2,
    "fuyu": run_fuyu,
    "gemma3": run_gemma3,
    "glm4v": run_glm4v,
    "h2ovl_chat": run_h2ovl,
    "idefics3": run_idefics3,
    "internvl_chat": run_internvl,
    "kimi_vl": run_kimi_vl,
    "llava": run_llava,
    "llava-next": run_llava_next,
    "llava-next-video": run_llava_next_video,
    "llava-onevision": run_llava_onevision,
    "mantis": run_mantis,
    "minicpmo": run_minicpmo,
    "minicpmv": run_minicpmv,
    "mistral3": run_mistral3,
    "mllama": run_mllama,
    "llama4": run_llama4,
    "molmo": run_molmo,
    "NVLM_D": run_nvlm_d,
    "ovis2": run_ovis2,
    "paligemma": run_paligemma,
    "paligemma2": run_paligemma2,
    "phi3_v": run_phi3v,
    "phi4_mm": run_phi4mm,
    "pixtral_hf": run_pixtral_hf,
    "qwen_vl": run_qwen_vl,
    "qwen2_vl": run_qwen2_vl,
    "qwen2_5_vl": run_qwen2_5_vl,
    "qwen2_5_omni": run_qwen2_5_omni,
    "skywork_chat": run_skyworkr1v,
    "smolvlm": run_smolvlm,
}


1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
def get_multi_modal_input(args):
    """Load the sample asset and the questions for the requested modality.

    Args:
        args: Parsed command-line arguments. ``args.modality`` selects the
            asset; ``args.num_frames`` is read only for ``"video"``.

    Returns:
        A dict of the form::

            {
                "data": image or video,
                "questions": list of question strings,
            }

    Raises:
        ValueError: If ``args.modality`` is neither ``"image"`` nor
            ``"video"``.
    """
    if args.modality == "image":
        # Input image and questions
        image = ImageAsset("cherry_blossom") \
            .pil_image.convert("RGB")
        img_questions = [
            "What is the content of this image?",
            "Describe the content of this image in detail.",
            "What's in the image?",
            "Where is this image taken?",
        ]

        return {
            "data": image,
            "questions": img_questions,
        }

    if args.modality == "video":
        # Input video and questions
        video = VideoAsset(name="sample_demo_1.mp4",
                           num_frames=args.num_frames).np_ndarrays
        vid_questions = ["Why is this video funny?"]

        return {
            "data": video,
            "questions": vid_questions,
        }

    msg = f"Modality {args.modality} is not supported."
    raise ValueError(msg)


1127
1128
def apply_image_repeat(image_repeat_prob, num_prompts, data,
                       prompts: list[str], modality):
    """Repeats images with provided probability of "image_repeat_prob".
    Used to simulate hit/miss for the MM preprocessor cache.

    Args:
        image_repeat_prob: Probability in ``[0, 1]`` that a request reuses
            the previous request's image unchanged.
        num_prompts: Number of request inputs to build.
        data: The initial image. It must provide ``copy()`` and
            ``putpixel()``; it is never modified in place.
        prompts: Prompt strings, cycled through when ``num_prompts`` exceeds
            ``len(prompts)``.
        modality: Key under which the image is stored in
            ``"multi_modal_data"``.

    Returns:
        A list of ``num_prompts`` dicts, each with a ``"prompt"`` and a
        ``"multi_modal_data"`` entry.
    """
    assert 0 <= image_repeat_prob <= 1.0
    no_yes = [0, 1]
    probs = [1.0 - image_repeat_prob, image_repeat_prob]

    inputs = []
    cur_image = data
    for i in range(num_prompts):
        res = random.choices(no_yes, probs)[0]
        if res == 0:
            # No repeat => Modify one pixel on a fresh copy so that this
            # request's image differs from the previous one.
            cur_image = cur_image.copy()
            # Spread the request index over three channels, keeping each
            # channel inside the 0-255 range.
            new_val = (i // 65536 % 256, i // 256 % 256, i % 256)
            cur_image.putpixel((0, 0), new_val)

        inputs.append({
            "prompt": prompts[i % len(prompts)],
            "multi_modal_data": {
                modality: cur_image
            }
        })

    return inputs


1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
@contextmanager
def time_counter(enable: bool):
    """Context manager that optionally reports how long its body took.

    When ``enable`` is false the body simply runs. Otherwise the wall-clock
    time spent inside the ``with`` block is printed between two separator
    lines once the block finishes.
    """
    if not enable:
        yield
        return

    import time

    separator = "-" * 50
    began_at = time.time()
    yield
    duration = time.time() - began_at
    print(separator)
    print("-- generate time = {}".format(duration))
    print(separator)


1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
def parse_args():
    """Define this example's command-line options and parse them."""
    parser = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "vision language models for text generation")
    add_option = parser.add_argument

    # Model and input selection
    add_option("--model-type",
               "-m",
               type=str,
               default="llava",
               choices=model_example_map.keys(),
               help='Huggingface "model_type".')
    add_option("--num-prompts",
               type=int,
               default=4,
               help="Number of prompts to run.")
    add_option("--modality",
               type=str,
               default="image",
               choices=["image", "video"],
               help="Modality of the input.")
    add_option("--num-frames",
               type=int,
               default=16,
               help="Number of frames to extract from the video.")
    add_option("--seed",
               type=int,
               default=None,
               help="Set the seed when initializing `vllm.LLM`.")

    # Multi-modal preprocessor cache controls
    add_option("--image-repeat-prob",
               type=float,
               default=None,
               help="Simulates the hit-ratio for multi-modal preprocessor "
               "cache (if enabled)")
    add_option("--disable-mm-preprocessor-cache",
               action="store_true",
               help="If True, disables caching of multi-modal "
               "preprocessor/mapper.")

    # Run behaviour
    add_option("--time-generate",
               action="store_true",
               help="If True, then print the total generate() call time")
    add_option("--use-different-prompt-per-request",
               action="store_true",
               help="If True, then use different prompt (with the same "
               "multi-modal data) for each request.")

    return parser.parse_args()


1224
1225
1226
1227
1228
def main(args):
    """Run the selected vision-language example end to end.

    Loads the sample input for ``args.modality``, builds the model-specific
    request data, creates the ``LLM`` engine, generates completions and
    prints each generated text.

    Args:
        args: Parsed command-line arguments as returned by ``parse_args()``.

    Raises:
        ValueError: If ``args.model_type`` is not in ``model_example_map``.
    """
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    modality = args.modality
    mm_input = get_multi_modal_input(args)
    data = mm_input["data"]
    questions = mm_input["questions"]

    req_data = model_example_map[model](questions, modality)

    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {})

    # Command-line settings take precedence over the per-model engine args.
    engine_args = asdict(req_data.engine_args) | {
        "seed": args.seed,
        "disable_mm_preprocessor_cache": args.disable_mm_preprocessor_cache,
    }
    llm = LLM(**engine_args)

    # Don't want to check the flag multiple times, so just hijack `prompts`.
    prompts = req_data.prompts if args.use_different_prompt_per_request else [
        req_data.prompts[0]
    ]

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=req_data.stop_token_ids)

    assert args.num_prompts > 0
    if args.num_prompts == 1:
        # Single inference
        inputs = {
            "prompt": prompts[0],
            "multi_modal_data": {
                modality: data
            },
        }
    elif args.image_repeat_prob is not None:
        # Batch inference: repeat images with the specified probability of
        # "image_repeat_prob"
        inputs = apply_image_repeat(args.image_repeat_prob,
                                    args.num_prompts, data, prompts,
                                    modality)
    else:
        # Batch inference: use the same image for all prompts
        inputs = [{
            "prompt": prompts[i % len(prompts)],
            "multi_modal_data": {
                modality: data
            },
        } for i in range(args.num_prompts)]

    # Add LoRA request if applicable
    lora_request = (req_data.lora_requests *
                    args.num_prompts if req_data.lora_requests else None)

    with time_counter(args.time_generate):
        outputs = llm.generate(
            inputs,
            sampling_params=sampling_params,
            lora_request=lora_request,
        )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        print("-" * 50)
1299
1300
1301


# Script entry point: parse the command line and run the example.
if __name__ == "__main__":
    args = parse_args()
    main(args)