"vllm/vscode:/vscode.git/clone" did not exist on "f46864d68dfb46ff88f574e6844f10fdb14cd3b5"
vision_language.py 37.6 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for text generation.

For most models, the prompt format should follow the corresponding examples
in the model's HuggingFace repository.
"""
9
import os
10
import random
11
from contextlib import contextmanager
12
13
from dataclasses import asdict
from typing import NamedTuple, Optional
14

15
from huggingface_hub import snapshot_download
16
17
from transformers import AutoTokenizer

18
from vllm import LLM, EngineArgs, SamplingParams
19
from vllm.assets.image import ImageAsset
20
from vllm.assets.video import VideoAsset
21
from vllm.lora.request import LoRARequest
22
23
from vllm.utils import FlexibleArgumentParser

24
25
26
27
28
29
30
31

class ModelRequestData(NamedTuple):
    """Result returned by the model-specific `run_*` builders in this file."""
    # Engine configuration assembled by the builder.
    engine_args: EngineArgs
    # Formatted prompts; the builders produce one per input question.
    prompts: list[str]
    # Token ids the builder designates as stop tokens (None if it sets none).
    stop_token_ids: Optional[list[int]] = None
    # LoRA requests supplied by the builder (None if it sets none).
    lora_requests: Optional[list[LoRARequest]] = None


32
33
34
35
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.

36

37
# Aria
38
def run_aria(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration, prompts and stop ids for Aria.

    Only the "image" modality is accepted.
    """
    assert modality == "image"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    aria_engine_args = EngineArgs(
        model="rhymes-ai/Aria",
        max_model_len=4096,
        max_num_seqs=2,
        dtype="bfloat16",
        limit_mm_per_prompt={modality: 1},
    )

    # One chat-formatted prompt per question, with the image slot placed
    # between the <fim_prefix>/<fim_suffix> markers.
    aria_prompts = []
    for question in questions:
        aria_prompts.append(
            f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
            "<|im_end|>\n<|im_start|>assistant\n")

    return ModelRequestData(
        engine_args=aria_engine_args,
        prompts=aria_prompts,
        stop_token_ids=[93532, 93653, 944, 93421, 1019, 93653, 93519],
    )
62
63


Jennifer Zhao's avatar
Jennifer Zhao committed
64
65
66
67
68
69
70
71
72
73
# Aya Vision
def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for Aya Vision 8B.

    Only the "image" modality is accepted.
    """
    assert modality == "image"

    # A user turn carrying the image tag and the question, followed by an
    # opened chatbot turn.
    user_turn_open = "<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>"
    chatbot_turn_open = ("<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|>"
                         "<|CHATBOT_TOKEN|>")
    aya_prompts = [
        f"{user_turn_open}{question}{chatbot_turn_open}"
        for question in questions
    ]

    aya_engine_args = EngineArgs(
        model="CohereForAI/aya-vision-8b",
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"crop_to_patches": True},
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(engine_args=aya_engine_args, prompts=aya_prompts)


86
# BLIP-2
87
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for BLIP-2 (OPT 6.7B).

    Only the "image" modality is accepted.
    """
    assert modality == "image"

    blip2_engine_args = EngineArgs(
        model="Salesforce/blip2-opt-6.7b",
        limit_mm_per_prompt={modality: 1},
    )

    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
    qa_prompts = []
    for question in questions:
        qa_prompts.append(f"Question: {question} Answer:")

    return ModelRequestData(engine_args=blip2_engine_args, prompts=qa_prompts)
102
103
104


# Chameleon
105
def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for Chameleon 7B.

    Only the "image" modality is accepted.
    """
    assert modality == "image"

    chameleon_engine_args = EngineArgs(
        model="facebook/chameleon-7b",
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    # The image tag is appended directly after the question text.
    tagged_prompts = []
    for question in questions:
        tagged_prompts.append(f"{question}<image>")

    return ModelRequestData(
        engine_args=chameleon_engine_args,
        prompts=tagged_prompts,
    )
120
121


122
# Deepseek-VL2
123
def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for DeepSeek-VL2 (tiny).

    Only the "image" modality is accepted.
    """
    assert modality == "image"

    deepseek_engine_args = EngineArgs(
        model="deepseek-ai/deepseek-vl2-tiny",
        max_model_len=4096,
        max_num_seqs=2,
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
        limit_mm_per_prompt={modality: 1},
    )

    dialogue_template = "<|User|>: <image>\n{question}\n\n<|Assistant|>:"
    dialogue_prompts = [
        dialogue_template.format(question=question) for question in questions
    ]

    return ModelRequestData(
        engine_args=deepseek_engine_args,
        prompts=dialogue_prompts,
    )
145
146


147
# Florence2
148
def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for Florence-2 (large).

    Only the "image" modality is accepted. The question text is not used:
    every prompt is the fixed "<MORE_DETAILED_CAPTION>" task token.
    """
    assert modality == "image"

    # Same fixed task prompt repeated once per question.
    caption_prompts = ["<MORE_DETAILED_CAPTION>"] * len(questions)

    florence_engine_args = EngineArgs(
        model="microsoft/Florence-2-large",
        tokenizer="Isotr0py/Florence-2-tokenizer",
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=florence_engine_args,
        prompts=caption_prompts,
    )
167
168


169
# Fuyu
170
def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for Fuyu 8B.

    Only the "image" modality is accepted.
    """
    assert modality == "image"

    fuyu_engine_args = EngineArgs(
        model="adept/fuyu-8b",
        max_model_len=2048,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    # Each prompt is just the question terminated by a newline.
    newline_prompts = []
    for question in questions:
        newline_prompts.append(f"{question}\n")

    return ModelRequestData(
        engine_args=fuyu_engine_args,
        prompts=newline_prompts,
    )
185
186


187
# Gemma 3
188
def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for Gemma 3 (4B, instruct).

    Only the "image" modality is accepted.
    """
    assert modality == "image"

    gemma_engine_args = EngineArgs(
        model="google/gemma-3-4b-it",
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"do_pan_and_scan": True},
        limit_mm_per_prompt={modality: 1},
    )

    # User turn with the image marker, then an opened model turn.
    turn_template = ("<bos><start_of_turn>user\n"
                     "<start_of_image>{question}<end_of_turn>\n"
                     "<start_of_turn>model\n")
    gemma_prompts = [
        turn_template.format(question=question) for question in questions
    ]

    return ModelRequestData(
        engine_args=gemma_engine_args,
        prompts=gemma_prompts,
    )
208
209


210
# GLM-4v
211
def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration, prompts and stop ids for GLM-4v 9B.

    Only the "image" modality is accepted.
    """
    assert modality == "image"
    model_name = "THUDM/glm-4v-9b"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        trust_remote_code=True,
        enforce_eager=True,
        hf_overrides={"architectures": ["GLM4VForCausalLM"]},
        limit_mm_per_prompt={modality: 1},
    )

    # Use adjacent string literals rather than a backslash continuation
    # inside the literal: the latter embeds the next source line's leading
    # indentation in the prompt, between the image block and the question.
    prompts = [("<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>"
                f"{question}<|assistant|>") for question in questions]

    stop_token_ids = [151329, 151336, 151338]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
237
238
239


# H2OVL-Mississippi
240
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration, prompts and stop ids for H2OVL.

    Only the "image" modality is accepted. Prompts are rendered with the
    chat template of the model's own tokenizer.
    """
    assert modality == "image"

    repo_id = "h2oai/h2ovl-mississippi-800m"

    h2ovl_engine_args = EngineArgs(
        model=repo_id,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={modality: 1},
    )

    hf_tokenizer = AutoTokenizer.from_pretrained(repo_id,
                                                 trust_remote_code=True)

    # One single-turn conversation per question.
    conversations = []
    for question in questions:
        user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
        conversations.append([user_turn])
    rendered_prompts = hf_tokenizer.apply_chat_template(
        conversations,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    return ModelRequestData(
        engine_args=h2ovl_engine_args,
        prompts=rendered_prompts,
        stop_token_ids=[hf_tokenizer.eos_token_id],
    )
271
272
273


# Idefics3-8B-Llama3
274
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for Idefics3-8B-Llama3.

    Only the "image" modality is accepted.
    """
    assert modality == "image"

    # if you are running out of memory, you can reduce the "longest_edge".
    # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
    image_size_kwargs = {"size": {"longest_edge": 3 * 364}}

    idefics_engine_args = EngineArgs(
        model="HuggingFaceM4/Idefics3-8B-Llama3",
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        mm_processor_kwargs=image_size_kwargs,
        limit_mm_per_prompt={modality: 1},
    )

    idefics_prompts = []
    for question in questions:
        idefics_prompts.append(
            f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\n"
            "Assistant:")

    return ModelRequestData(
        engine_args=idefics_engine_args,
        prompts=idefics_prompts,
    )
300
301


302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
# SmolVLM2-2.2B-Instruct
def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for SmolVLM2-2.2B-Instruct.

    Only the "image" modality is accepted.
    """
    assert modality == "image"

    image_size_kwargs = {"max_image_size": {"longest_edge": 384}}

    smolvlm_engine_args = EngineArgs(
        model="HuggingFaceTB/SmolVLM2-2.2B-Instruct",
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        mm_processor_kwargs=image_size_kwargs,
        limit_mm_per_prompt={modality: 1},
    )

    smolvlm_prompts = []
    for question in questions:
        smolvlm_prompts.append(
            f"<|im_start|>User:<image>{question}<end_of_utterance>\n"
            "Assistant:")

    return ModelRequestData(
        engine_args=smolvlm_engine_args,
        prompts=smolvlm_prompts,
    )


330
# InternVL
331
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration, prompts and stop ids for InternVL2-2B.

    Only the "image" modality is accepted. Prompts are rendered with the
    chat template of the model's own tokenizer.
    """
    assert modality == "image"

    repo_id = "OpenGVLab/InternVL2-2B"

    internvl_engine_args = EngineArgs(
        model=repo_id,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={modality: 1},
    )

    hf_tokenizer = AutoTokenizer.from_pretrained(repo_id,
                                                 trust_remote_code=True)

    # One single-turn conversation per question.
    conversations = []
    for question in questions:
        conversations.append([{
            'role': 'user',
            'content': f"<image>\n{question}"
        }])
    rendered_prompts = hf_tokenizer.apply_chat_template(
        conversations,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Stop tokens for InternVL
    # models variants may have different stop tokens
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_words = ("<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>")
    stop_ids = list(map(hf_tokenizer.convert_tokens_to_ids, stop_words))

    return ModelRequestData(
        engine_args=internvl_engine_args,
        prompts=rendered_prompts,
        stop_token_ids=stop_ids,
    )
365
366


367
368
369
370
371
372
373
374
375
376
377
378
379
# Kimi-VL
def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for Kimi-VL-A3B-Instruct.

    Only the "image" modality is accepted.
    """
    assert modality == "image"

    kimi_engine_args = EngineArgs(
        model="moonshotai/Kimi-VL-A3B-Instruct",
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={modality: 1},
    )

    # User turn (image media block followed by the question), then an opened
    # assistant turn.
    user_turn_head = (
        "<|im_user|>user<|im_middle|><|media_start|>image<|media_content|>"
        "<|media_pad|><|media_end|>")
    assistant_turn_head = "<|im_end|><|im_assistant|>assistant<|im_middle|>"
    kimi_prompts = [
        f"{user_turn_head}{question}{assistant_turn_head}"
        for question in questions
    ]

    return ModelRequestData(engine_args=kimi_engine_args, prompts=kimi_prompts)


390
# LLaVA-1.5
391
def run_llava(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for LLaVA-1.5 7B.

    Only the "image" modality is accepted.
    """
    assert modality == "image"

    llava_engine_args = EngineArgs(
        model="llava-hf/llava-1.5-7b-hf",
        max_model_len=4096,
        limit_mm_per_prompt={modality: 1},
    )

    chat_template = "USER: <image>\n{question}\nASSISTANT:"
    llava_prompts = [
        chat_template.format(question=question) for question in questions
    ]

    return ModelRequestData(
        engine_args=llava_engine_args,
        prompts=llava_prompts,
    )
408
409
410


# LLaVA-1.6/LLaVA-NeXT
411
def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for LLaVA-1.6 (Mistral 7B).

    Only the "image" modality is accepted.
    """
    assert modality == "image"

    llava_next_engine_args = EngineArgs(
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        max_model_len=8192,
        limit_mm_per_prompt={modality: 1},
    )

    inst_prompts = []
    for question in questions:
        inst_prompts.append(f"[INST] <image>\n{question} [/INST]")

    return ModelRequestData(
        engine_args=llava_next_engine_args,
        prompts=inst_prompts,
    )
425
426
427
428


# LlaVA-NeXT-Video
# Currently only support for video input
429
430
def run_llava_next_video(questions: list[str],
                         modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for LLaVA-NeXT-Video 7B.

    Only the "video" modality is accepted.
    """
    assert modality == "video"

    video_engine_args = EngineArgs(
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    chat_template = "USER: <video>\n{question} ASSISTANT:"
    video_prompts = [
        chat_template.format(question=question) for question in questions
    ]

    return ModelRequestData(
        engine_args=video_engine_args,
        prompts=video_prompts,
    )
447
448


449
# LLaVA-OneVision
450
451
def run_llava_onevision(questions: list[str],
                        modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for LLaVA-OneVision 7B.

    Args:
        questions: Questions to turn into prompts, one prompt each.
        modality: Either "image" or "video"; selects the media tag placed
            in the user turn.

    Raises:
        ValueError: If `modality` is neither "image" nor "video".
    """
    media_tags = {"image": "<image>", "video": "<video>"}
    if modality not in media_tags:
        # Fail with a clear message instead of hitting an unbound `prompts`
        # further down.
        raise ValueError(
            f"Unsupported modality for LLaVA-OneVision: {modality!r} "
            "(expected 'image' or 'video')")
    media_tag = media_tags[modality]

    # Adjacent string literals are used rather than a backslash continuation
    # inside the literal, which would embed source indentation in the prompt.
    prompts = [(f"<|im_start|>user {media_tag}\n{question}<|im_end|>"
                "<|im_start|>assistant\n") for question in questions]

    engine_args = EngineArgs(
        model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
        max_model_len=16384,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
475
476


477
# Mantis
478
def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration, prompts and stop ids for Mantis 8B.

    Only the "image" modality is accepted.
    """
    assert modality == "image"

    mantis_engine_args = EngineArgs(
        model="TIGER-Lab/Mantis-8B-siglip-llama3",
        max_model_len=4096,
        hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
        limit_mm_per_prompt={modality: 1},
    )

    # Llama-3 style chat layout; the user message is the question followed
    # by the image tag on its own line.
    user_header = "<|start_header_id|>user<|end_header_id|>\n\n"
    assistant_header = (
        "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n")
    mantis_prompts = []
    for question in questions:
        user_message = f"{question}\n<image>"
        mantis_prompts.append(user_header + user_message + assistant_header)

    return ModelRequestData(
        engine_args=mantis_engine_args,
        prompts=mantis_prompts,
        stop_token_ids=[128009],
    )
500
501
502


# MiniCPM-V
503
def run_minicpmv_base(questions: list[str], modality: str,
                      model_name: str) -> ModelRequestData:
    """Build the engine configuration, prompts and stop ids for MiniCPM-V/o.

    Args:
        questions: Questions to turn into prompts, one prompt each.
        modality: Either "image" or "video"; selects the media placeholder
            inserted before the question.
        model_name: Model repository to load both the tokenizer and the
            engine from.

    Returns:
        The request data, with stop token ids set to the ids of
        '<|im_end|>' and '<|endoftext|>'.
    """
    assert modality in ["image", "video"]
    # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa

    # Known `model_name` values and the modalities each version supports:
    #   2.0  (image):               "HwwwH/MiniCPM-V-2"
    #        The official repo doesn't work yet, so we need to use a fork
    #        for now. For more details, please see:
    #        https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa
    #   2.5  (image):               "openbmb/MiniCPM-Llama3-V-2_5"
    #   2.6  (image, video):        "openbmb/MiniCPM-V-2_6"
    #   o2.6 (image, video, audio): "openbmb/MiniCPM-o-2_6"
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        limit_mm_per_prompt={modality: 1},
    )

    # NOTE The stop_token_ids are different for various versions of MiniCPM-V
    #   2.0:        [tokenizer.eos_id]
    #   2.5:        [tokenizer.eos_id, tokenizer.eot_id]
    #   2.6 / o2.6: the ids of the tokens below (what this function uses)
    stop_tokens = ['<|im_end|>', '<|endoftext|>']
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    modality_placeholder = {
        "image": "(<image>./</image>)",
        "video": "(<video>./</video>)",
    }

    # Each question is rendered separately as a single-turn conversation.
    prompts = [
        tokenizer.apply_chat_template(
            [{
                'role': 'user',
                'content': f"{modality_placeholder[modality]}\n{question}"
            }],
            tokenize=False,
            add_generation_prompt=True) for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
565
566


567
def run_minicpmo(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for MiniCPM-o 2.6 via `run_minicpmv_base`."""
    return run_minicpmv_base(
        questions,
        modality,
        model_name="openbmb/MiniCPM-o-2_6",
    )
569
570


571
def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for MiniCPM-V 2.6 via `run_minicpmv_base`."""
    return run_minicpmv_base(
        questions,
        modality,
        model_name="openbmb/MiniCPM-V-2_6",
    )
573
574


575
576
577
578
579
580
581
582
583
584
585
586
# Mistral-3 HF-format
def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for Mistral Small 3.1.

    Only the "image" modality is accepted.
    """
    assert modality == "image"

    inst_prompts = []
    for question in questions:
        inst_prompts.append(f"<s>[INST]{question}\n[IMG][/INST]")

    # NOTE: Need L40 (or equivalent) to avoid OOM
    mistral_engine_args = EngineArgs(
        model="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=mistral_engine_args,
        prompts=inst_prompts,
    )


598
# LLama 3.2
599
def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for Llama 3.2 11B Vision.

    Only the "image" modality is accepted. Prompts are rendered with the
    chat template of the model's own tokenizer.
    """
    assert modality == "image"

    repo_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

    # Note: The default setting of max_num_seqs (256) and
    # max_model_len (131072) for this model may cause OOM.
    # You may lower either to run this example on lower-end GPUs.

    # The configuration below has been confirmed to launch on a single L40 GPU.
    mllama_engine_args = EngineArgs(
        model=repo_id,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    hf_tokenizer = AutoTokenizer.from_pretrained(repo_id)

    # One single-turn conversation per question; the user message holds an
    # image part followed by the question text.
    conversations = []
    for question in questions:
        image_part = {"type": "image"}
        text_part = {"type": "text", "text": question}
        conversations.append([{
            "role": "user",
            "content": [image_part, text_part],
        }])
    rendered_prompts = hf_tokenizer.apply_chat_template(
        conversations,
        add_generation_prompt=True,
        tokenize=False,
    )

    return ModelRequestData(
        engine_args=mllama_engine_args,
        prompts=rendered_prompts,
    )
635
636


637
def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for Llama 4 Scout.

    Only the "image" modality is accepted. Prompts are rendered with the
    chat template of the model's own tokenizer; no stop token ids are set.
    """
    assert modality == "image"

    repo_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

    llama4_engine_args = EngineArgs(
        model=repo_id,
        max_model_len=8192,
        max_num_seqs=4,
        tensor_parallel_size=8,
        gpu_memory_utilization=0.4,
        limit_mm_per_prompt={modality: 1},
    )

    hf_tokenizer = AutoTokenizer.from_pretrained(repo_id)

    # One single-turn conversation per question; the user message holds an
    # image part followed by the question text.
    conversations = []
    for question in questions:
        image_part = {"type": "image"}
        text_part = {"type": "text", "text": f"{question}"}
        conversations.append([{
            "role": "user",
            "content": [image_part, text_part],
        }])
    rendered_prompts = hf_tokenizer.apply_chat_template(
        conversations,
        add_generation_prompt=True,
        tokenize=False,
    )

    return ModelRequestData(
        engine_args=llama4_engine_args,
        prompts=rendered_prompts,
        stop_token_ids=None,
    )


673
# Molmo
674
def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for Molmo 7B-D.

    Only the "image" modality is accepted.
    """
    assert modality == "image"

    model_name = "allenai/Molmo-7B-D-0924"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={modality: 1},
    )

    # Adjacent string literals are used rather than a backslash continuation
    # inside the literal, which would embed source indentation in the prompt.
    prompts = [(f"<|im_start|>user <image>\n{question}<|im_end|>"
                "<|im_start|>assistant\n") for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
695
696


697
# NVLM-D
698
def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for NVLM-D 72B.

    Only the "image" modality is accepted. Prompts are rendered with the
    chat template of the model's own tokenizer.
    """
    assert modality == "image"

    repo_id = "nvidia/NVLM-D-72B"

    # Adjust this as necessary to fit in GPU
    nvlm_engine_args = EngineArgs(
        model=repo_id,
        trust_remote_code=True,
        max_model_len=4096,
        tensor_parallel_size=4,
        limit_mm_per_prompt={modality: 1},
    )

    hf_tokenizer = AutoTokenizer.from_pretrained(repo_id,
                                                 trust_remote_code=True)

    # One single-turn conversation per question.
    conversations = []
    for question in questions:
        conversations.append([{
            'role': 'user',
            'content': f"<image>\n{question}"
        }])
    rendered_prompts = hf_tokenizer.apply_chat_template(
        conversations,
        tokenize=False,
        add_generation_prompt=True,
    )

    return ModelRequestData(
        engine_args=nvlm_engine_args,
        prompts=rendered_prompts,
    )
726
727


728
729
# Ovis
def run_ovis(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for Ovis2-1B.

    Only the "image" modality is accepted. Prompts are rendered with the
    chat template of the model's own tokenizer.
    """
    assert modality == "image"

    repo_id = "AIDC-AI/Ovis2-1B"

    ovis_engine_args = EngineArgs(
        model=repo_id,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        limit_mm_per_prompt={modality: 1},
    )

    hf_tokenizer = AutoTokenizer.from_pretrained(repo_id,
                                                 trust_remote_code=True)

    # One single-turn conversation per question.
    conversations = []
    for question in questions:
        conversations.append([{
            'role': 'user',
            'content': f"<image>\n{question}"
        }])
    rendered_prompts = hf_tokenizer.apply_chat_template(
        conversations,
        tokenize=False,
        add_generation_prompt=True,
    )

    return ModelRequestData(
        engine_args=ovis_engine_args,
        prompts=rendered_prompts,
    )


759
# PaliGemma
760
def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for PaliGemma 3B.

    Only the "image" modality is accepted. The question text is not used:
    every prompt is the fixed string "caption en".
    """
    assert modality == "image"

    paligemma_engine_args = EngineArgs(
        model="google/paligemma-3b-mix-224",
        limit_mm_per_prompt={modality: 1},
    )

    # PaliGemma has special prompt format for VQA
    caption_prompts = ["caption en"] * len(questions)

    return ModelRequestData(
        engine_args=paligemma_engine_args,
        prompts=caption_prompts,
    )
774
775


776
# PaliGemma 2
777
def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for PaliGemma 2 3B.

    Only the "image" modality is accepted. The question text is not used:
    every prompt is the fixed string "caption en".
    """
    assert modality == "image"

    paligemma2_engine_args = EngineArgs(
        model="google/paligemma2-3b-ft-docci-448",
        limit_mm_per_prompt={modality: 1},
    )

    # PaliGemma 2 has special prompt format for VQA
    caption_prompts = ["caption en"] * len(questions)

    return ModelRequestData(
        engine_args=paligemma2_engine_args,
        prompts=caption_prompts,
    )
791
792


793
# Phi-3-Vision
794
def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for Phi-3.5-vision.

    Only the "image" modality is accepted.
    """
    assert modality == "image"

    chat_template = "<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n"
    phi3v_prompts = [
        chat_template.format(question=question) for question in questions
    ]

    # num_crops is an override kwarg to the multimodal image processor;
    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
    # to use 16 for single frame scenarios, and 4 for multi-frame.
    #
    # Generally speaking, a larger value for num_crops results in more
    # tokens per image instance, because it may scale the image more in
    # the image preprocessing. Some references in the model docs and the
    # formula for image tokens after the preprocessing
    # transform can be found below.
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    image_processor_kwargs = {"num_crops": 16}

    phi3v_engine_args = EngineArgs(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs=image_processor_kwargs,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=phi3v_engine_args,
        prompts=phi3v_prompts,
    )
828
829


830
# Phi-4-multimodal-instruct
831
def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
    """
    Build the request data for Phi-4-multimodal-instruct with image inputs.

    The model supports both image and audio inputs; this function handles
    the image case only and attaches the vision LoRA as a LoRA request.
    """
    assert modality == "image"

    local_model_dir = snapshot_download("microsoft/Phi-4-multimodal-instruct")
    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    vision_adapter_dir = os.path.join(local_model_dir, "vision-lora")

    chat_template = "<|user|><|image_1|>{question}<|end|><|assistant|>"
    phi4_prompts = [
        chat_template.format(question=question) for question in questions
    ]

    phi4_engine_args = EngineArgs(
        model=local_model_dir,
        trust_remote_code=True,
        max_model_len=5120,
        max_num_seqs=2,
        max_num_batched_tokens=12800,
        enable_lora=True,
        max_lora_rank=320,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"dynamic_hd": 16},
        limit_mm_per_prompt={modality: 1},
    )

    vision_lora = LoRARequest("vision", 1, vision_adapter_dir)
    return ModelRequestData(
        engine_args=phi4_engine_args,
        prompts=phi4_prompts,
        lora_requests=[vision_lora],
    )
863
864


865
# Pixtral HF-format
866
def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for Pixtral 12B (HF format).

    Only the "image" modality is accepted.
    """
    assert modality == "image"

    inst_prompts = []
    for question in questions:
        inst_prompts.append(f"<s>[INST]{question}\n[IMG][/INST]")

    # NOTE: Need L40 (or equivalent) to avoid OOM
    pixtral_engine_args = EngineArgs(
        model="mistral-community/pixtral-12b",
        max_model_len=6144,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=pixtral_engine_args,
        prompts=inst_prompts,
    )
885
886


887
# Qwen
888
def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for Qwen-VL.

    Only the "image" modality is accepted.
    """
    assert modality == "image"

    # The question comes first, followed by the numbered picture slot.
    picture_prompts = []
    for question in questions:
        picture_prompts.append(f"{question}Picture 1: <img></img>\n")

    qwen_engine_args = EngineArgs(
        model="Qwen/Qwen-VL",
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=qwen_engine_args,
        prompts=picture_prompts,
    )
906
907


908
# Qwen2-VL
909
def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for Qwen2-VL 7B Instruct.

    Args:
        questions: Questions to turn into prompts, one prompt each.
        modality: Either "image" or "video"; selects the pad token placed
            between the vision start/end markers.

    Raises:
        ValueError: If `modality` is neither "image" nor "video".
    """
    placeholders = {
        "image": "<|image_pad|>",
        "video": "<|video_pad|>",
    }
    if modality not in placeholders:
        # Fail with a clear message instead of hitting an unbound
        # `placeholder` while the prompts are being built.
        raise ValueError(f"Unsupported modality for Qwen2-VL: {modality!r} "
                         "(expected 'image' or 'video')")
    placeholder = placeholders[modality]

    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
        },
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
         f"{question}<|im_end|>\n"
         "<|im_start|>assistant\n") for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
941
942


Roger Wang's avatar
Roger Wang committed
943
# Qwen2.5-VL
944
def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and prompts for Qwen2.5-VL 3B Instruct.

    Args:
        questions: Questions to turn into prompts, one prompt each.
        modality: Either "image" or "video"; selects the pad token placed
            between the vision start/end markers.

    Raises:
        ValueError: If `modality` is neither "image" nor "video".
    """
    placeholders = {
        "image": "<|image_pad|>",
        "video": "<|video_pad|>",
    }
    if modality not in placeholders:
        # Fail with a clear message instead of hitting an unbound
        # `placeholder` while the prompts are being built.
        raise ValueError(f"Unsupported modality for Qwen2.5-VL: {modality!r} "
                         "(expected 'image' or 'video')")
    placeholder = placeholders[modality]

    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
         f"{question}<|im_end|>\n"
         "<|im_start|>assistant\n") for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
Roger Wang's avatar
Roger Wang committed
976
977


978
979
980
981
982
983
984
985
986
987
988
989
990
# Qwen2.5-Omni
def run_qwen2_5_omni(questions: list[str], modality: str):
    model_name = "Qwen/Qwen2.5-Omni-7B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": [1],
        },
991
        limit_mm_per_prompt={modality: 1},
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
    )

    if modality == "image":
        placeholder = "<|IMAGE|>"
    elif modality == "video":
        placeholder = "<|VIDEO|>"

    default_system = (
        "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
        "Group, capable of perceiving auditory and visual inputs, as well as "
        "generating text and speech.")

    prompts = [(f"<|im_start|>system\n{default_system}<|im_end|>\n"
                f"<|im_start|>user\n<|vision_bos|>{placeholder}<|vision_eos|>"
                f"{question}<|im_end|>\n"
                "<|im_start|>assistant\n") for question in questions]
    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# SkyworkR1V
def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "Skywork/Skywork-R1V-38B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
1024
        limit_mm_per_prompt={modality: 1},
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    messages = [[{
        'role': 'user',
        'content': f"<image>\n{question}"
    }] for question in questions]
    prompts = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
                                            add_generation_prompt=True)

    # Stop tokens for SkyworkR1V
    # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py
    stop_tokens = ["<|end▁of▁sentence|>", "<|endoftext|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


1049
# Maps each `--model-type` choice to the function that builds its request
# data; `main` calls the selected function with (questions, modality).
model_example_map = {
    "aria": run_aria,
    "aya_vision": run_aya_vision,
    "blip-2": run_blip2,
    "chameleon": run_chameleon,
    "deepseek_vl_v2": run_deepseek_vl2,
    "florence2": run_florence2,
    "fuyu": run_fuyu,
    "gemma3": run_gemma3,
    "glm4v": run_glm4v,
    "h2ovl_chat": run_h2ovl,
    "idefics3": run_idefics3,
    "internvl_chat": run_internvl,
    "kimi_vl": run_kimi_vl,
    "llava": run_llava,
    "llava-next": run_llava_next,
    "llava-next-video": run_llava_next_video,
    "llava-onevision": run_llava_onevision,
    "mantis": run_mantis,
    "minicpmo": run_minicpmo,
    "minicpmv": run_minicpmv,
    "mistral3": run_mistral3,
    "mllama": run_mllama,
    "llama4": run_llama4,
    "molmo": run_molmo,
    "NVLM_D": run_nvlm_d,
    "ovis": run_ovis,
    "paligemma": run_paligemma,
    "paligemma2": run_paligemma2,
    "phi3_v": run_phi3v,
    "phi4_mm": run_phi4mm,
    "pixtral_hf": run_pixtral_hf,
    "qwen_vl": run_qwen_vl,
    "qwen2_vl": run_qwen2_vl,
    "qwen2_5_vl": run_qwen2_5_vl,
    "qwen2_5_omni": run_qwen2_5_omni,
    "skywork_chat": run_skyworkr1v,
    "smolvlm": run_smolvlm,
}


1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
def get_multi_modal_input(args):
    """Load the sample asset and questions for the requested modality.

    Args:
        args: Parsed CLI arguments. `args.modality` selects the asset;
            `args.num_frames` is read only for the "video" modality.

    Returns:
        A dict of the form
        {
            "data": image or video,
            "questions": list of question strings,
        }

    Raises:
        ValueError: If `args.modality` is neither "image" nor "video".
    """
    if args.modality == "image":
        # Input image and question
        image = ImageAsset("cherry_blossom") \
            .pil_image.convert("RGB")
        img_questions = [
            "What is the content of this image?",
            "Describe the content of this image in detail.",
            "What's in the image?",
            "Where is this image taken?",
        ]

        return {
            "data": image,
            "questions": img_questions,
        }

    if args.modality == "video":
        # Input video and question
        video = VideoAsset(name="baby_reading",
                           num_frames=args.num_frames).np_ndarrays
        vid_questions = ["Why is this video funny?"]

        return {
            "data": video,
            "questions": vid_questions,
        }

    msg = f"Modality {args.modality} is not supported."
    raise ValueError(msg)


1128
1129
def apply_image_repeat(image_repeat_prob, num_prompts, data,
                       prompts: list[str], modality):
    """Build `num_prompts` requests, repeating images with a given probability.

    Used to simulate hit/miss for the MM preprocessor cache: for each request
    the previous image is reused with probability `image_repeat_prob`;
    otherwise a copy with a changed top-left pixel is used instead.

    Args:
        image_repeat_prob: Probability in [0, 1] of reusing the previous image.
        num_prompts: Number of requests to build.
        data: The starting image; it is copied before being modified.
        prompts: Prompt strings, cycled through if fewer than `num_prompts`.
        modality: Key under which the image is stored in "multi_modal_data".

    Returns:
        A list of dicts with "prompt" and "multi_modal_data" entries.

    Raises:
        ValueError: If `image_repeat_prob` is outside [0, 1].
    """
    # NOTE(review): relies on `copy()` and `putpixel()` on `data`, i.e. a
    # PIL-style image; the video input built in this file is a numpy array
    # and would not support `putpixel` - confirm callers only pass images.
    if not 0 <= image_repeat_prob <= 1.0:
        raise ValueError(
            f"image_repeat_prob must be in [0, 1], got {image_repeat_prob}.")

    no_yes = [0, 1]
    probs = [1.0 - image_repeat_prob, image_repeat_prob]

    inputs = []
    cur_image = data
    for i in range(num_prompts):
        repeat = random.choices(no_yes, probs)[0]
        if repeat == 0:
            # No repeat => Modify one pixel. Encode the request index as a
            # 24-bit RGB value so every channel stays within 0..255.
            cur_image = cur_image.copy()
            new_val = ((i >> 16) & 0xFF, (i >> 8) & 0xFF, i & 0xFF)
            cur_image.putpixel((0, 0), new_val)

        inputs.append({
            "prompt": prompts[i % len(prompts)],
            "multi_modal_data": {
                modality: cur_image
            }
        })

    return inputs


1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
@contextmanager
def time_counter(enable: bool):
    """Context manager that optionally reports how long its body took.

    Args:
        enable: When True, the elapsed time of the `with` body is printed
            between two separator lines once the body finishes normally.
            When False, the body runs with no timing and no output.
    """
    if not enable:
        yield
        return

    import time

    # perf_counter is monotonic, so the measurement is not skewed by
    # system clock adjustments the way time.time() can be.
    start_time = time.perf_counter()
    yield
    elapsed_time = time.perf_counter() - start_time
    print("-" * 50)
    print(f"-- generate time = {elapsed_time}")
    print("-" * 50)


1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
def parse_args():
    """Define the command-line options of this example and parse them.

    Returns:
        The namespace produced by the parser's `parse_args()`.
    """
    parser = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "vision language models for text generation")
    add = parser.add_argument

    # Model / workload selection.
    add("--model-type",
        "-m",
        type=str,
        default="llava",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".')
    add("--num-prompts",
        type=int,
        default=4,
        help="Number of prompts to run.")
    add("--modality",
        type=str,
        default="image",
        choices=["image", "video"],
        help="Modality of the input.")
    add("--num-frames",
        type=int,
        default=16,
        help="Number of frames to extract from the video.")
    add("--seed",
        type=int,
        default=None,
        help="Set the seed when initializing `vllm.LLM`.")

    # Multi-modal preprocessor cache experiments.
    add("--image-repeat-prob",
        type=float,
        default=None,
        help="Simulates the hit-ratio for multi-modal preprocessor cache"
        " (if enabled)")
    add("--disable-mm-preprocessor-cache",
        action="store_true",
        help="If True, disables caching of multi-modal preprocessor/mapper.")

    # Reporting and prompt variation.
    add("--time-generate",
        action="store_true",
        help="If True, then print the total generate() call time")
    add("--use-different-prompt-per-request",
        action="store_true",
        help="If True, then use different prompt (with the same multi-modal "
        "data) for each request.")

    return parser.parse_args()


1225
1226
1227
1228
1229
def main(args):
    """Run offline inference for the selected model and print the outputs.

    Loads the sample input for `args.modality`, builds the model-specific
    request data, constructs the LLM, generates for `args.num_prompts`
    requests and prints each generated text.

    Args:
        args: Parsed CLI arguments as returned by `parse_args()`.

    Raises:
        ValueError: If `args.num_prompts` is not positive or
            `args.model_type` has no entry in `model_example_map`.
    """
    # Validate the CLI inputs before constructing the LLM.
    if args.num_prompts <= 0:
        raise ValueError(
            f"--num-prompts must be positive, got {args.num_prompts}.")

    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    modality = args.modality
    mm_input = get_multi_modal_input(args)
    data = mm_input["data"]
    questions = mm_input["questions"]

    req_data = model_example_map[model](questions, modality)

    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {})

    engine_args = asdict(req_data.engine_args) | {
        "seed": args.seed,
        "disable_mm_preprocessor_cache": args.disable_mm_preprocessor_cache,
    }
    llm = LLM(**engine_args)

    # Don't want to check the flag multiple times, so just hijack `prompts`.
    prompts = req_data.prompts if args.use_different_prompt_per_request else [
        req_data.prompts[0]
    ]

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=req_data.stop_token_ids)

    if args.num_prompts == 1:
        # Single inference
        inputs = {
            "prompt": prompts[0],
            "multi_modal_data": {
                modality: data
            },
        }
    else:
        # Batch inference
        if args.image_repeat_prob is not None:
            # Repeat images with specified probability of "image_repeat_prob"
            inputs = apply_image_repeat(args.image_repeat_prob,
                                        args.num_prompts, data, prompts,
                                        modality)
        else:
            # Use the same image for all prompts
            inputs = [{
                "prompt": prompts[i % len(prompts)],
                "multi_modal_data": {
                    modality: data
                },
            } for i in range(args.num_prompts)]

    # Add LoRA request if applicable
    lora_request = (req_data.lora_requests *
                    args.num_prompts if req_data.lora_requests else None)

    with time_counter(args.time_generate):
        outputs = llm.generate(
            inputs,
            sampling_params=sampling_params,
            lora_request=lora_request,
        )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        print("-" * 50)
1300
1301
1302


# Script entry point: parse the CLI arguments and run the example.
if __name__ == "__main__":
    args = parse_args()
    main(args)