"docs/vscode:/vscode.git/clone" did not exist on "eb0b2d2f08b622f4b93fb0a811a047ad987a46ca"
vision_language.py 35.4 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for text generation.

For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
9
import os
10
import random
11
from contextlib import contextmanager
12
13
from dataclasses import asdict
from typing import NamedTuple, Optional
14

15
from huggingface_hub import snapshot_download
16
17
from transformers import AutoTokenizer

18
from vllm import LLM, EngineArgs, SamplingParams
19
from vllm.assets.image import ImageAsset
20
from vllm.assets.video import VideoAsset
21
from vllm.lora.request import LoRARequest
22
23
from vllm.utils import FlexibleArgumentParser

24
25
26
27
28
29
30
31

class ModelRequestData(NamedTuple):
    """Bundle returned by the `run_*` builder functions in this file.

    Groups the engine configuration for one model together with the
    prompts formatted for it and any optional per-model extras.
    """
    # Engine configuration for the model being demonstrated.
    engine_args: EngineArgs
    # Formatted prompt strings (the builders produce one per question).
    prompts: list[str]
    # Stop token ids chosen by the builder; None when the builder sets none.
    stop_token_ids: Optional[list[int]] = None
    # LoRA requests chosen by the builder; None when the builder sets none.
    lora_requests: Optional[list[LoRARequest]] = None


32
33
34
35
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.

36

37
# Aria
38
def run_aria(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments, prompts and stop tokens for rhymes-ai/Aria.

    Only the "image" modality is accepted. Each question is placed after a
    single `<|img|>` placeholder inside the user turn.
    """
    assert modality == "image"

    def _as_prompt(question: str) -> str:
        # One user turn holding the image placeholder and the question,
        # followed by the opening of the assistant turn.
        return (f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
                "<|im_end|>\n<|im_start|>assistant\n")

    # NOTE: Need L40 (or equivalent) to avoid OOM
    aria_engine = EngineArgs(
        model="rhymes-ai/Aria",
        max_model_len=4096,
        max_num_seqs=2,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=aria_engine,
        prompts=[_as_prompt(question) for question in questions],
        stop_token_ids=[93532, 93653, 944, 93421, 1019, 93653, 93519],
    )
62
63


Jennifer Zhao's avatar
Jennifer Zhao committed
64
65
66
67
68
69
70
71
72
73
# Aya Vision
def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for CohereForAI/aya-vision-8b.

    Only the "image" modality is accepted. No stop tokens are set.
    """
    assert modality == "image"

    # Text placed before and after the question in every prompt.
    user_turn_open = "<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>"
    chatbot_turn_open = (
        "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>")

    aya_engine = EngineArgs(
        model="CohereForAI/aya-vision-8b",
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"crop_to_patches": True},
        limit_mm_per_prompt={"image": 1},
    )

    formatted = [
        f"{user_turn_open}{question}{chatbot_turn_open}"
        for question in questions
    ]
    return ModelRequestData(engine_args=aya_engine, prompts=formatted)


86
# BLIP-2
87
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for Salesforce/blip2-opt-6.7b.

    Only the "image" modality is accepted. Prompts use the plain
    "Question: ... Answer:" form.
    """
    assert modality == "image"

    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
    qa_prompts = []
    for question in questions:
        qa_prompts.append(f"Question: {question} Answer:")

    blip2_engine = EngineArgs(
        model="Salesforce/blip2-opt-6.7b",
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(engine_args=blip2_engine, prompts=qa_prompts)
102
103
104


# Chameleon
105
def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for facebook/chameleon-7b.

    Only the "image" modality is accepted. The `<image>` placeholder is
    appended after the question text.
    """
    assert modality == "image"

    image_suffix = "<image>"
    suffixed_questions = [
        f"{question}{image_suffix}" for question in questions
    ]

    chameleon_engine = EngineArgs(
        model="facebook/chameleon-7b",
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=chameleon_engine,
        prompts=suffixed_questions,
    )
120
121


122
# Deepseek-VL2
123
def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for deepseek-ai/deepseek-vl2-tiny.

    Only the "image" modality is accepted. The architecture is overridden
    through `hf_overrides`.
    """
    assert modality == "image"

    def _as_prompt(question: str) -> str:
        return f"<|User|>: <image>\n{question}\n\n<|Assistant|>:"

    deepseek_engine = EngineArgs(
        model="deepseek-ai/deepseek-vl2-tiny",
        max_model_len=4096,
        max_num_seqs=2,
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=deepseek_engine,
        prompts=[_as_prompt(question) for question in questions],
    )
145
146


147
# Florence2
148
def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for microsoft/Florence-2-large.

    Only the "image" modality is accepted. The question text is not used:
    every prompt is the same fixed task token, one per question.
    """
    assert modality == "image"

    florence_engine = EngineArgs(
        model="microsoft/Florence-2-large",
        # The tokenizer is loaded from a separate repository.
        tokenizer="Isotr0py/Florence-2-tokenizer",
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": 1},
    )

    caption_task = "<MORE_DETAILED_CAPTION>"
    task_prompts = [caption_task for _ in questions]

    return ModelRequestData(engine_args=florence_engine, prompts=task_prompts)
167
168


169
# Fuyu
170
def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for adept/fuyu-8b.

    Only the "image" modality is accepted. Each prompt is the question
    followed by a newline.
    """
    assert modality == "image"

    newline_terminated = []
    for question in questions:
        newline_terminated.append(f"{question}\n")

    fuyu_engine = EngineArgs(
        model="adept/fuyu-8b",
        max_model_len=2048,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=fuyu_engine,
        prompts=newline_terminated,
    )
185
186


187
# Gemma 3
188
def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for google/gemma-3-4b-it.

    Only the "image" modality is accepted. `do_pan_and_scan` is enabled
    through `mm_processor_kwargs`.
    """
    assert modality == "image"

    def _as_prompt(question: str) -> str:
        # User turn with the image marker, then the opening of the model turn.
        user_turn = ("<bos><start_of_turn>user\n"
                     f"<start_of_image>{question}<end_of_turn>\n")
        return user_turn + "<start_of_turn>model\n"

    gemma_engine = EngineArgs(
        model="google/gemma-3-4b-it",
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"do_pan_and_scan": True},
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=gemma_engine,
        prompts=[_as_prompt(question) for question in questions],
    )
208
209


210
# GLM-4v
211
def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments, prompts and stop tokens for THUDM/glm-4v-9b.

    Only the "image" modality is accepted. The architecture is overridden
    through `hf_overrides` and eager mode is enforced.
    """
    assert modality == "image"

    glm_engine = EngineArgs(
        model="THUDM/glm-4v-9b",
        max_model_len=2048,
        max_num_seqs=2,
        trust_remote_code=True,
        enforce_eager=True,
        hf_overrides={"architectures": ["GLM4VForCausalLM"]},
        limit_mm_per_prompt={"image": 1},
    )

    image_turn = "<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>"
    # The prompt carries eight spaces between the image markers and the
    # question. NOTE(review): this looks like source indentation captured by
    # a string line continuation -- confirm whether the model needs it.
    gap = " " * 8
    glm_prompts = [
        f"{image_turn}{gap}{question}<|assistant|>" for question in questions
    ]

    return ModelRequestData(
        engine_args=glm_engine,
        prompts=glm_prompts,
        stop_token_ids=[151329, 151336, 151338],
    )
237
238
239


# H2OVL-Mississippi
240
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments, prompts and stop tokens for H2OVL-Mississippi.

    Only the "image" modality is accepted. Prompts are rendered with the
    chat template of the model's own tokenizer.
    """
    assert modality == "image"

    checkpoint = "h2oai/h2ovl-mississippi-800m"

    h2ovl_engine = EngineArgs(
        model=checkpoint,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={"image": 1},
    )

    chat_tokenizer = AutoTokenizer.from_pretrained(checkpoint,
                                                   trust_remote_code=True)

    # One single-turn conversation per question.
    conversations = []
    for question in questions:
        user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
        conversations.append([user_turn])

    rendered = chat_tokenizer.apply_chat_template(conversations,
                                                  tokenize=False,
                                                  add_generation_prompt=True)

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    return ModelRequestData(
        engine_args=h2ovl_engine,
        prompts=rendered,
        stop_token_ids=[chat_tokenizer.eos_token_id],
    )
271
272
273


# Idefics3-8B-Llama3
274
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for Idefics3-8B-Llama3.

    Only the "image" modality is accepted. Eager mode is enforced and the
    image processor's longest edge is set explicitly.
    """
    assert modality == "image"

    # if you are running out of memory, you can reduce the "longest_edge".
    # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
    processor_overrides = {
        "size": {
            "longest_edge": 3 * 364
        },
    }

    idefics_engine = EngineArgs(
        model="HuggingFaceM4/Idefics3-8B-Llama3",
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        mm_processor_kwargs=processor_overrides,
        limit_mm_per_prompt={"image": 1},
    )

    def _as_prompt(question: str) -> str:
        return (f"<|begin_of_text|>User:<image>{question}"
                "<end_of_utterance>\nAssistant:")

    return ModelRequestData(
        engine_args=idefics_engine,
        prompts=[_as_prompt(question) for question in questions],
    )
300
301


302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
# SmolVLM2-2.2B-Instruct
def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for SmolVLM2-2.2B-Instruct.

    Only the "image" modality is accepted. Eager mode is enforced and the
    maximum image size is set through `mm_processor_kwargs`.
    """
    assert modality == "image"

    processor_overrides = {
        "max_image_size": {
            "longest_edge": 384
        },
    }

    smolvlm_engine = EngineArgs(
        model="HuggingFaceTB/SmolVLM2-2.2B-Instruct",
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        mm_processor_kwargs=processor_overrides,
        limit_mm_per_prompt={"image": 1},
    )

    smolvlm_prompts = []
    for question in questions:
        smolvlm_prompts.append(
            f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:"
        )

    return ModelRequestData(
        engine_args=smolvlm_engine,
        prompts=smolvlm_prompts,
    )


330
# InternVL
331
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments, prompts and stop tokens for InternVL2-2B.

    Only the "image" modality is accepted. Prompts are rendered with the
    chat template of the model's own tokenizer, and the stop token ids are
    looked up from that tokenizer.
    """
    assert modality == "image"

    checkpoint = "OpenGVLab/InternVL2-2B"

    internvl_engine = EngineArgs(
        model=checkpoint,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": 1},
    )

    chat_tokenizer = AutoTokenizer.from_pretrained(checkpoint,
                                                   trust_remote_code=True)

    # One single-turn conversation per question.
    conversations = []
    for question in questions:
        user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
        conversations.append([user_turn])

    rendered = chat_tokenizer.apply_chat_template(conversations,
                                                  tokenize=False,
                                                  add_generation_prompt=True)

    # Stop tokens for InternVL
    # models variants may have different stop tokens
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_ids = []
    for token in ("<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"):
        stop_ids.append(chat_tokenizer.convert_tokens_to_ids(token))

    return ModelRequestData(
        engine_args=internvl_engine,
        prompts=rendered,
        stop_token_ids=stop_ids,
    )
365
366


367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
# Kimi-VL
def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for moonshotai/Kimi-VL-A3B-Instruct.

    Only the "image" modality is accepted. No stop tokens are set.

    The engine arguments depend only on this function's inputs; no
    command-line state is read here.
    NOTE(review): presumably the caller applies command-line overrides such
    as `disable_mm_preprocessor_cache` to the returned engine arguments --
    confirm against the entry point.
    """
    assert modality == "image"

    prompts = [
        "<|im_user|>user<|im_middle|><|media_start|>image<|media_content|>"
        f"<|media_pad|><|media_end|>{question}<|im_end|>"
        "<|im_assistant|>assistant<|im_middle|>" for question in questions
    ]

    engine_args = EngineArgs(
        model="moonshotai/Kimi-VL-A3B-Instruct",
        trust_remote_code=True,
        max_model_len=4096,
        # Same per-prompt image limit as the other image-only builders.
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


390
# LLaVA-1.5
391
def run_llava(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for llava-hf/llava-1.5-7b-hf.

    Only the "image" modality is accepted. Prompts use the
    "USER: ... ASSISTANT:" form with an `<image>` placeholder.
    """
    assert modality == "image"

    def _as_prompt(question: str) -> str:
        return f"USER: <image>\n{question}\nASSISTANT:"

    llava_prompts = [_as_prompt(question) for question in questions]

    llava_engine = EngineArgs(
        model="llava-hf/llava-1.5-7b-hf",
        max_model_len=4096,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(engine_args=llava_engine, prompts=llava_prompts)
408
409
410


# LLaVA-1.6/LLaVA-NeXT
411
def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for llava-v1.6-mistral-7b-hf.

    Only the "image" modality is accepted. Each question is wrapped in
    `[INST] ... [/INST]` after an `<image>` placeholder.
    """
    assert modality == "image"

    inst_prompts = []
    for question in questions:
        inst_prompts.append(f"[INST] <image>\n{question} [/INST]")

    llava_next_engine = EngineArgs(
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        max_model_len=8192,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=llava_next_engine,
        prompts=inst_prompts,
    )
425
426
427
428


# LLaVA-NeXT-Video
# Currently only supports video input
429
430
def run_llava_next_video(questions: list[str],
                         modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for LLaVA-NeXT-Video-7B-hf.

    Only the "video" modality is accepted. Prompts use the
    "USER: ... ASSISTANT:" form with a `<video>` placeholder.
    """
    assert modality == "video"

    prompts = [
        f"USER: <video>\n{question} ASSISTANT:" for question in questions
    ]

    engine_args = EngineArgs(
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        max_model_len=8192,
        max_num_seqs=2,
        # Key the per-prompt limit on the modality actually sent ("video"),
        # not on "image", which this builder never uses.
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
447
448


449
# LLaVA-OneVision
450
451
def run_llava_onevision(questions: list[str],
                        modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for llava-onevision-qwen2-7b-ov-hf.

    Args:
        questions: Question texts, one prompt is produced per entry.
        modality: Either "image" or "video"; selects the `<image>` or
            `<video>` placeholder and the per-prompt multimodal limit.

    Raises:
        ValueError: If `modality` is neither "image" nor "video".
    """
    if modality not in ("image", "video"):
        # Fail with a clear message instead of hitting an unbound local.
        raise ValueError(
            f"Unsupported modality for LLaVA-OneVision: {modality!r} "
            "(expected 'image' or 'video')")

    # The prompt carries nine spaces between `<|im_end|>` and the assistant
    # turn; kept exactly as-is so the generated text is unchanged.
    # NOTE(review): eight of them look like source indentation captured by a
    # string line continuation -- confirm whether the model needs them.
    gap = " " * 9
    prompts = [
        f"<|im_start|>user <{modality}>\n{question}<|im_end|>{gap}"
        "<|im_start|>assistant\n" for question in questions
    ]

    engine_args = EngineArgs(
        model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
        max_model_len=16384,
        # Limit the modality that is actually sent with each prompt.
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
475
476


477
# Mantis
478
def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments, prompts and stop tokens for Mantis.

    Only the "image" modality is accepted. Each question, followed by an
    `<image>` placeholder, is substituted into a Llama-3 style template.
    """
    assert modality == "image"

    llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'  # noqa: E501

    mantis_prompts = []
    for question in questions:
        user_content = f"{question}\n<image>"
        mantis_prompts.append(llama3_template.format(user_content))

    mantis_engine = EngineArgs(
        model="TIGER-Lab/Mantis-8B-siglip-llama3",
        max_model_len=4096,
        hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=mantis_engine,
        prompts=mantis_prompts,
        stop_token_ids=[128009],
    )
500
501
502


# MiniCPM-V
503
def run_minicpmv_base(questions: list[str], modality: str,
                      model_name: str) -> ModelRequestData:
    """Build engine arguments, prompts and stop tokens for a MiniCPM model.

    Args:
        questions: Question texts, one prompt is produced per entry.
        modality: Either "image" or "video"; selects the placeholder put in
            front of each question and the per-prompt multimodal limit.
        model_name: Checkpoint to load for both the tokenizer and the engine.

    Returns:
        The engine arguments, the chat-templated prompts and the stop token
        ids looked up from the checkpoint's tokenizer.
    """
    assert modality in ["image", "video"]
    # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa

    # Known checkpoints for `model_name` and the modalities each supports:
    #   2.0   "HwwwH/MiniCPM-V-2"             image
    #         The official repo doesn't work yet, so a fork is needed for now.
    #         For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa
    #   2.5   "openbmb/MiniCPM-Llama3-V-2_5"  image
    #   2.6   "openbmb/MiniCPM-V-2_6"         image, video
    #   o2.6  "openbmb/MiniCPM-o-2_6"         image, video, audio
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        # Limit the modality that is actually sent with each prompt.
        limit_mm_per_prompt={modality: 1},
    )

    # NOTE The stop_token_ids are different for various versions of MiniCPM-V
    #   2.0:        [tokenizer.eos_id]
    #   2.5:        [tokenizer.eos_id, tokenizer.eot_id]
    #   2.6 / o2.6: the two tokens below
    stop_tokens = ['<|im_end|>', '<|endoftext|>']
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    modality_placeholder = {
        "image": "(<image>./</image>)",
        "video": "(<video>./</video>)",
    }

    # Each question is rendered as its own single-turn conversation.
    prompts = [
        tokenizer.apply_chat_template(
            [{
                'role': 'user',
                'content': f"{modality_placeholder[modality]}\n{question}"
            }],
            tokenize=False,
            add_generation_prompt=True) for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
565
566


567
def run_minicpmo(questions: list[str], modality: str) -> ModelRequestData:
    """Build request data for the openbmb/MiniCPM-o-2_6 checkpoint.

    Thin wrapper that forwards to `run_minicpmv_base`.
    """
    checkpoint = "openbmb/MiniCPM-o-2_6"
    return run_minicpmv_base(questions, modality, checkpoint)
569
570


571
def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData:
    """Build request data for the openbmb/MiniCPM-V-2_6 checkpoint.

    Thin wrapper that forwards to `run_minicpmv_base`.
    """
    checkpoint = "openbmb/MiniCPM-V-2_6"
    return run_minicpmv_base(questions, modality, checkpoint)
573
574


575
576
577
578
579
580
581
582
583
584
585
586
# Mistral-3 HF-format
def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for Mistral-Small-3.1-24B-Instruct.

    Only the "image" modality is accepted. The engine is configured with
    `tensor_parallel_size=2`.
    """
    assert modality == "image"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    mistral_engine = EngineArgs(
        model="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": 1},
    )

    inst_prompts = []
    for question in questions:
        inst_prompts.append(f"<s>[INST]{question}\n[IMG][/INST]")

    return ModelRequestData(engine_args=mistral_engine, prompts=inst_prompts)


598
# Llama 3.2
599
def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for Llama-3.2-11B-Vision-Instruct.

    Only the "image" modality is accepted. Prompts are rendered with the
    chat template of the model's own tokenizer.
    """
    assert modality == "image"

    checkpoint = "meta-llama/Llama-3.2-11B-Vision-Instruct"

    # Note: The default setting of max_num_seqs (256) and
    # max_model_len (131072) for this model may cause OOM.
    # You may lower either to run this example on lower-end GPUs.

    # The configuration below has been confirmed to launch on a single L40 GPU.
    mllama_engine = EngineArgs(
        model=checkpoint,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
    )

    chat_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # One single-turn conversation per question: an image part followed by
    # the question as a text part.
    conversations = []
    for question in questions:
        image_part = {"type": "image"}
        text_part = {"type": "text", "text": question}
        conversations.append([{
            "role": "user",
            "content": [image_part, text_part]
        }])

    rendered = chat_tokenizer.apply_chat_template(conversations,
                                                  add_generation_prompt=True,
                                                  tokenize=False)

    return ModelRequestData(engine_args=mllama_engine, prompts=rendered)
635
636


637
def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for Llama-4-Scout-17B-16E-Instruct.

    Only the "image" modality is accepted. Prompts are rendered with the
    chat template of the model's own tokenizer; no stop tokens are set.
    """
    assert modality == "image"

    checkpoint = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

    llama4_engine = EngineArgs(
        model=checkpoint,
        max_model_len=8192,
        max_num_seqs=4,
        tensor_parallel_size=8,
        gpu_memory_utilization=0.4,
        limit_mm_per_prompt={"image": 1},
    )

    chat_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # One single-turn conversation per question: an image part followed by
    # the question as a text part.
    conversations = []
    for question in questions:
        image_part = {"type": "image"}
        text_part = {"type": "text", "text": f"{question}"}
        conversations.append([{
            "role": "user",
            "content": [image_part, text_part]
        }])

    rendered = chat_tokenizer.apply_chat_template(conversations,
                                                  add_generation_prompt=True,
                                                  tokenize=False)

    return ModelRequestData(
        engine_args=llama4_engine,
        prompts=rendered,
        stop_token_ids=None,
    )


673
# Molmo
674
def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for allenai/Molmo-7B-D-0924.

    Only the "image" modality is accepted. No stop tokens are set.
    """
    assert modality == "image"

    molmo_engine = EngineArgs(
        model="allenai/Molmo-7B-D-0924",
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": 1},
    )

    # The prompt carries nine spaces between `<|im_end|>` and the assistant
    # turn. NOTE(review): eight of them look like source indentation captured
    # by a string line continuation -- confirm whether the model needs them.
    gap = " " * 9
    molmo_prompts = []
    for question in questions:
        user_turn = f"<|im_start|>user <image>\n{question}<|im_end|>"
        molmo_prompts.append(f"{user_turn}{gap}<|im_start|>assistant\n")

    return ModelRequestData(engine_args=molmo_engine, prompts=molmo_prompts)
695
696


697
# NVLM-D
698
def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for nvidia/NVLM-D-72B.

    Only the "image" modality is accepted. Prompts are rendered with the
    chat template of the model's own tokenizer.
    """
    assert modality == "image"

    checkpoint = "nvidia/NVLM-D-72B"

    # Adjust this as necessary to fit in GPU
    nvlm_engine = EngineArgs(
        model=checkpoint,
        trust_remote_code=True,
        max_model_len=4096,
        tensor_parallel_size=4,
        limit_mm_per_prompt={"image": 1},
    )

    chat_tokenizer = AutoTokenizer.from_pretrained(checkpoint,
                                                   trust_remote_code=True)

    # One single-turn conversation per question.
    conversations = []
    for question in questions:
        user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
        conversations.append([user_turn])

    rendered = chat_tokenizer.apply_chat_template(conversations,
                                                  tokenize=False,
                                                  add_generation_prompt=True)

    return ModelRequestData(engine_args=nvlm_engine, prompts=rendered)
726
727


728
# PaliGemma
729
def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for google/paligemma-3b-mix-224.

    Only the "image" modality is accepted. The question text is not used:
    every prompt is the fixed string "caption en", one per question.
    """
    assert modality == "image"

    # PaliGemma has special prompt format for VQA
    fixed_prompt = "caption en"
    caption_prompts = [fixed_prompt for _ in questions]

    paligemma_engine = EngineArgs(
        model="google/paligemma-3b-mix-224",
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=paligemma_engine,
        prompts=caption_prompts,
    )
743
744


745
# PaliGemma 2
746
def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for paligemma2-3b-ft-docci-448.

    Only the "image" modality is accepted. The question text is not used:
    every prompt is the fixed string "caption en", one per question.
    """
    assert modality == "image"

    # PaliGemma 2 has special prompt format for VQA
    fixed_prompt = "caption en"
    caption_prompts = [fixed_prompt for _ in questions]

    paligemma2_engine = EngineArgs(
        model="google/paligemma2-3b-ft-docci-448",
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=paligemma2_engine,
        prompts=caption_prompts,
    )
760
761


762
# Phi-3-Vision
763
def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for Phi-3.5-vision-instruct.

    Only the "image" modality is accepted. `num_crops` is passed to the
    image processor through `mm_processor_kwargs`.
    """
    assert modality == "image"

    def _as_prompt(question: str) -> str:
        return f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n"

    phi3v_prompts = [_as_prompt(question) for question in questions]

    # num_crops is an override kwarg to the multimodal image processor;
    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
    # to use 16 for single frame scenarios, and 4 for multi-frame.
    #
    # Generally speaking, a larger value for num_crops results in more
    # tokens per image instance, because it may scale the image more in
    # the image preprocessing. Some references in the model docs and the
    # formula for image tokens after the preprocessing
    # transform can be found below.
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    processor_overrides = {"num_crops": 16}

    phi3v_engine = EngineArgs(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs=processor_overrides,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(engine_args=phi3v_engine, prompts=phi3v_prompts)
797
798


799
# Phi-4-multimodal-instruct
800
def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments, prompts and a LoRA request for Phi-4-multimodal.

    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
    show how to process image inputs, so only the "image" modality is
    accepted. The model snapshot is downloaded first so that the vision
    LoRA directory inside it can be referenced by path.
    """
    assert modality == "image"

    snapshot_dir = snapshot_download("microsoft/Phi-4-multimodal-instruct")
    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    vision_lora_dir = os.path.join(snapshot_dir, "vision-lora")

    phi4_prompts = []
    for question in questions:
        phi4_prompts.append(
            f"<|user|><|image_1|>{question}<|end|><|assistant|>")

    phi4_engine = EngineArgs(
        model=snapshot_dir,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        enable_lora=True,
        max_lora_rank=320,
        limit_mm_per_prompt={"image": 1},
    )

    vision_lora = LoRARequest("vision", 1, vision_lora_dir)
    return ModelRequestData(
        engine_args=phi4_engine,
        prompts=phi4_prompts,
        lora_requests=[vision_lora],
    )
829
830


831
# Pixtral HF-format
832
def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for mistral-community/pixtral-12b.

    Only the "image" modality is accepted. Each question is wrapped in
    `[INST] ... [/INST]` with an `[IMG]` placeholder after it.
    """
    assert modality == "image"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    pixtral_engine = EngineArgs(
        model="mistral-community/pixtral-12b",
        max_model_len=6144,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
    )

    inst_prompts = []
    for question in questions:
        inst_prompts.append(f"<s>[INST]{question}\n[IMG][/INST]")

    return ModelRequestData(engine_args=pixtral_engine, prompts=inst_prompts)
851
852


853
# Qwen
854
def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for Qwen/Qwen-VL.

    Only the "image" modality is accepted. The architecture is overridden
    through `hf_overrides`, and the image placeholder follows the question.
    """
    assert modality == "image"

    qwen_engine = EngineArgs(
        model="Qwen/Qwen-VL",
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        limit_mm_per_prompt={"image": 1},
    )

    picture_suffix = "Picture 1: <img></img>\n"
    qwen_prompts = [f"{question}{picture_suffix}" for question in questions]

    return ModelRequestData(engine_args=qwen_engine, prompts=qwen_prompts)
872
873


874
# Qwen2-VL
875
def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for Qwen/Qwen2-VL-7B-Instruct.

    Args:
        questions: Question texts, one prompt is produced per entry.
        modality: Either "image" or "video"; selects the pad token placed
            between the vision markers and the per-prompt multimodal limit.

    Raises:
        ValueError: If `modality` is neither "image" nor "video".
    """
    placeholders = {
        "image": "<|image_pad|>",
        "video": "<|video_pad|>",
    }
    if modality not in placeholders:
        # Fail with a clear message instead of hitting an unbound local.
        raise ValueError(
            f"Unsupported modality for Qwen2-VL: {modality!r} "
            "(expected 'image' or 'video')")
    placeholder = placeholders[modality]

    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
        },
        # Limit the modality that is actually sent with each prompt.
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
         f"{question}<|im_end|>\n"
         "<|im_start|>assistant\n") for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
907
908


Roger Wang's avatar
Roger Wang committed
909
# Qwen2.5-VL
910
def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for Qwen/Qwen2.5-VL-3B-Instruct.

    Args:
        questions: Question texts, one prompt is produced per entry.
        modality: Either "image" or "video"; selects the pad token placed
            between the vision markers and the per-prompt multimodal limit.

    Raises:
        ValueError: If `modality` is neither "image" nor "video".
    """
    placeholders = {
        "image": "<|image_pad|>",
        "video": "<|video_pad|>",
    }
    if modality not in placeholders:
        # Fail with a clear message instead of hitting an unbound local.
        raise ValueError(
            f"Unsupported modality for Qwen2.5-VL: {modality!r} "
            "(expected 'image' or 'video')")
    placeholder = placeholders[modality]

    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
        # Limit the modality that is actually sent with each prompt.
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
         f"{question}<|im_end|>\n"
         "<|im_start|>assistant\n") for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
Roger Wang's avatar
Roger Wang committed
942
943


944
945
946
947
948
949
950
951
952
953
# SkyworkR1V
def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments, prompts and stop tokens for Skywork-R1V.

    Only the "image" modality is accepted. Each question becomes a
    single-turn user conversation that is rendered with the chat template
    of the model's own tokenizer.
    """
    assert modality == "image"

    model_name = "Skywork/Skywork-R1V-38B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": 1},
    )

    # The tokenizer provides both the chat template and the stop-token ids.
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)

    conversations = []
    for question in questions:
        user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
        conversations.append([user_turn])
    prompts = tokenizer.apply_chat_template(conversations,
                                            tokenize=False,
                                            add_generation_prompt=True)

    # Stop tokens for SkyworkR1V
    # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py
    stop_token_ids = [
        tokenizer.convert_tokens_to_ids(token)
        for token in ("<|end▁of▁sentence|>", "<|endoftext|>")
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


979
# Registry of the available examples: each key is a value accepted by
# `--model-type`, each value is the function that builds the request data
# for that model and is called as `fn(questions, modality)`.
model_example_map = {
    "aria": run_aria,
    "aya_vision": run_aya_vision,
    "blip-2": run_blip2,
    "chameleon": run_chameleon,
    "deepseek_vl_v2": run_deepseek_vl2,
    "florence2": run_florence2,
    "fuyu": run_fuyu,
    "gemma3": run_gemma3,
    "glm4v": run_glm4v,
    "h2ovl_chat": run_h2ovl,
    "idefics3": run_idefics3,
    "internvl_chat": run_internvl,
    "kimi_vl": run_kimi_vl,
    "llava": run_llava,
    "llava-next": run_llava_next,
    "llava-next-video": run_llava_next_video,
    "llava-onevision": run_llava_onevision,
    "mantis": run_mantis,
    "minicpmo": run_minicpmo,
    "minicpmv": run_minicpmv,
    "mistral3": run_mistral3,
    "mllama": run_mllama,
    "llama4": run_llama4,
    "molmo": run_molmo,
    "NVLM_D": run_nvlm_d,
    "paligemma": run_paligemma,
    "paligemma2": run_paligemma2,
    "phi3_v": run_phi3v,
    "phi4_mm": run_phi4mm,
    "pixtral_hf": run_pixtral_hf,
    "qwen_vl": run_qwen_vl,
    "qwen2_vl": run_qwen2_vl,
    "qwen2_5_vl": run_qwen2_5_vl,
    "skywork_chat": run_skyworkr1v,
    "smolvlm": run_smolvlm,
}


1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
def get_multi_modal_input(args):
    """Load the sample asset and the questions for the chosen modality.

    Args:
        args: Parsed command-line arguments. ``args.modality`` selects the
            asset; ``args.num_frames`` is read only for the video modality.

    Returns:
        A dict of the form::

            {
                "data": image or video,
                "questions": list of question strings,
            }

    Raises:
        ValueError: If ``args.modality`` is neither "image" nor "video".
    """
    if args.modality == "image":
        # Input image and question
        image = ImageAsset("cherry_blossom") \
            .pil_image.convert("RGB")
        img_questions = [
            "What is the content of this image?",
            "Describe the content of this image in detail.",
            "What's in the image?",
            "Where is this image taken?",
        ]

        return {
            "data": image,
            "questions": img_questions,
        }

    if args.modality == "video":
        # Input video and question
        video = VideoAsset(name="sample_demo_1.mp4",
                           num_frames=args.num_frames).np_ndarrays
        vid_questions = ["Why is this video funny?"]

        return {
            "data": video,
            "questions": vid_questions,
        }

    msg = f"Modality {args.modality} is not supported."
    raise ValueError(msg)


1056
1057
def apply_image_repeat(image_repeat_prob, num_prompts, data,
                       prompts: list[str], modality):
    """Repeats images with provided probability of "image_repeat_prob".
    Used to simulate hit/miss for the MM preprocessor cache.

    Args:
        image_repeat_prob: Probability in [0, 1] that a request reuses the
            image of the previous request unchanged.
        num_prompts: Number of request dicts to build.
        data: The starting image. It must provide ``copy()`` and
            ``putpixel()`` (presumably a PIL image -- confirm with caller).
        prompts: Prompt strings; they are cycled through when
            ``num_prompts`` exceeds ``len(prompts)``.
        modality: Key under which the image is stored in
            "multi_modal_data".

    Returns:
        A list of ``num_prompts`` dicts, each holding a "prompt" and a
        "multi_modal_data" entry.
    """
    assert (image_repeat_prob <= 1.0 and image_repeat_prob >= 0)
    no_yes = [0, 1]
    probs = [1.0 - image_repeat_prob, image_repeat_prob]

    inputs = []
    cur_image = data
    for i in range(num_prompts):
        res = random.choices(no_yes, probs)[0]
        if res == 0:
            # No repeat => Modify one pixel
            cur_image = cur_image.copy()
            # Encode the request index as a 24-bit colour. Every channel is
            # masked to 8 bits so that the value stays a valid colour and
            # keeps changing with `i` once `i` reaches 65536.
            new_val = ((i >> 16) & 0xFF, (i >> 8) & 0xFF, i & 0xFF)
            cur_image.putpixel((0, 0), new_val)

        inputs.append({
            "prompt": prompts[i % len(prompts)],
            "multi_modal_data": {
                modality: cur_image
            }
        })

    return inputs


1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
@contextmanager
def time_counter(enable: bool):
    """Context manager that optionally reports how long its body took.

    When ``enable`` is false the body simply runs. Otherwise the elapsed
    wall-clock time is printed between two separator lines after the body
    has finished.
    """
    if not enable:
        yield
        return

    import time
    began_at = time.time()
    yield
    duration = time.time() - began_at

    separator = "-" * 50
    print(separator)
    print("-- generate time = {}".format(duration))
    print(separator)


1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
def parse_args():
    """Define and parse the command-line options of this example.

    Returns:
        The result of calling ``parse_args()`` on the configured parser.
    """
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'vision language models for text generation')

    # Model and workload selection.
    parser.add_argument(
        '--model-type',
        '-m',
        type=str,
        default="llava",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )
    parser.add_argument(
        '--num-prompts',
        type=int,
        default=4,
        help='Number of prompts to run.',
    )
    parser.add_argument(
        '--modality',
        type=str,
        default="image",
        choices=['image', 'video'],
        help='Modality of the input.',
    )
    parser.add_argument(
        '--num-frames',
        type=int,
        default=16,
        help='Number of frames to extract from the video.',
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Set the seed when initializing `vllm.LLM`.",
    )

    # Multi-modal preprocessor cache behaviour.
    parser.add_argument(
        '--image-repeat-prob',
        type=float,
        default=None,
        help='Simulates the hit-ratio for multi-modal preprocessor cache'
        ' (if enabled)',
    )
    parser.add_argument(
        '--disable-mm-preprocessor-cache',
        action='store_true',
        help='If True, disables caching of multi-modal preprocessor/mapper.',
    )

    # Reporting and prompt variation.
    parser.add_argument(
        '--time-generate',
        action='store_true',
        help='If True, then print the total generate() call time',
    )
    parser.add_argument(
        '--use-different-prompt-per-request',
        action='store_true',
        help='If True, then use different prompt (with the same multi-modal '
        'data) for each request.',
    )

    return parser.parse_args()


1153
1154
1155
1156
1157
def main(args):
    """Run the selected vision-language example end to end.

    Loads the sample input, builds the model-specific request data, creates
    the engine, generates completions and prints each generated text.

    Args:
        args: Parsed command-line arguments (see ``parse_args``).

    Raises:
        ValueError: If ``args.model_type`` is not a key of
            ``model_example_map``.
    """
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    modality = args.modality
    mm_input = get_multi_modal_input(args)
    data, questions = mm_input["data"], mm_input["questions"]

    req_data = model_example_map[model](questions, modality)

    # Disable other modalities to save memory
    requested_limits = dict(req_data.engine_args.limit_mm_per_prompt or {})
    req_data.engine_args.limit_mm_per_prompt = {
        "image": 0,
        "video": 0,
        "audio": 0,
        **requested_limits,
    }

    # Command-line settings override the values of the model's engine args.
    engine_kwargs = asdict(req_data.engine_args)
    engine_kwargs["seed"] = args.seed
    engine_kwargs["disable_mm_preprocessor_cache"] = (
        args.disable_mm_preprocessor_cache)
    llm = LLM(**engine_kwargs)

    # Don't want to check the flag multiple times, so just hijack `prompts`.
    if args.use_different_prompt_per_request:
        prompts = req_data.prompts
    else:
        prompts = [req_data.prompts[0]]

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=req_data.stop_token_ids)

    assert args.num_prompts > 0

    def build_request(prompt):
        # One request: a prompt paired with the shared multi-modal data.
        return {
            "prompt": prompt,
            "multi_modal_data": {
                modality: data
            },
        }

    if args.num_prompts == 1:
        # Single inference
        inputs = build_request(prompts[0])
    elif args.image_repeat_prob is not None:
        # Batch inference, repeating images with the specified probability
        # of "image_repeat_prob"
        inputs = apply_image_repeat(args.image_repeat_prob,
                                    args.num_prompts, data, prompts,
                                    modality)
    else:
        # Batch inference, using the same image for all prompts
        inputs = [
            build_request(prompts[i % len(prompts)])
            for i in range(args.num_prompts)
        ]

    # Add LoRA request if applicable
    lora_request = None
    if req_data.lora_requests:
        lora_request = req_data.lora_requests * args.num_prompts

    with time_counter(args.time_generate):
        outputs = llm.generate(
            inputs,
            sampling_params=sampling_params,
            lora_request=lora_request,
        )

    separator = "-" * 50
    print(separator)
    for output in outputs:
        print(output.outputs[0].text)
        print(separator)
1228
1229
1230


if __name__ == "__main__":
    # Script entry point: parse the command line, then run the example.
    main(parse_args())