vision_language.py 37.7 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for text generation.

For most models, the prompt format should follow the corresponding examples
on the HuggingFace model repository.
"""
9
import os
10
import random
11
from contextlib import contextmanager
12
13
from dataclasses import asdict
from typing import NamedTuple, Optional
14

15
from huggingface_hub import snapshot_download
16
17
from transformers import AutoTokenizer

18
from vllm import LLM, EngineArgs, SamplingParams
19
from vllm.assets.image import ImageAsset
20
from vllm.assets.video import VideoAsset
21
from vllm.lora.request import LoRARequest
22
from vllm.multimodal.image import convert_image_mode
23
24
from vllm.utils import FlexibleArgumentParser

25
26
27
28
29
30
31
32

class ModelRequestData(NamedTuple):
    """Bundle returned by every ``run_*`` model helper in this file."""
    # Engine configuration built for the selected model.
    engine_args: EngineArgs
    # One fully formatted prompt string per input question.
    prompts: list[str]
    # Stop token IDs supplied by the model helper; None when it sets none.
    stop_token_ids: Optional[list[int]] = None
    # LoRA requests supplied by the model helper; None when it uses no LoRA.
    lora_requests: Optional[list[LoRARequest]] = None


33
34
35
36
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.

37

38
# Aria
39
def run_aria(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments, prompts and stop tokens for rhymes-ai/Aria.

    Only the ``"image"`` modality is accepted.
    """
    assert modality == "image"

    def _as_prompt(question: str) -> str:
        # Wrap a single question in Aria's chat markup with one image slot.
        return (f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
                "<|im_end|>\n<|im_start|>assistant\n")

    # NOTE: Need L40 (or equivalent) to avoid OOM
    aria_args = EngineArgs(
        model="rhymes-ai/Aria",
        max_model_len=4096,
        max_num_seqs=2,
        dtype="bfloat16",
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=aria_args,
        prompts=[_as_prompt(q) for q in questions],
        stop_token_ids=[93532, 93653, 944, 93421, 1019, 93653, 93519],
    )
63
64


Jennifer Zhao's avatar
Jennifer Zhao committed
65
66
67
68
69
70
71
72
73
74
# Aya Vision
def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for CohereForAI/aya-vision-8b.

    Only the ``"image"`` modality is accepted.
    """
    assert modality == "image"

    aya_args = EngineArgs(
        model="CohereForAI/aya-vision-8b",
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"crop_to_patches": True},
        limit_mm_per_prompt={modality: 1},
    )

    # The question sits between the user-turn opener (with the image tag)
    # and the markers that close the user turn and open the chatbot turn.
    user_open = "<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>"
    bot_open = "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
    formatted = [f"{user_open}{question}{bot_open}" for question in questions]

    return ModelRequestData(engine_args=aya_args, prompts=formatted)


87
# BLIP-2
88
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Salesforce/blip2-opt-6.7b.

    Only the ``"image"`` modality is accepted.
    """
    assert modality == "image"

    blip2_args = EngineArgs(
        model="Salesforce/blip2-opt-6.7b",
        limit_mm_per_prompt={modality: 1},
    )

    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
    qa_prompts = [f"Question: {q} Answer:" for q in questions]

    return ModelRequestData(engine_args=blip2_args, prompts=qa_prompts)
103
104
105


# Chameleon
106
def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for facebook/chameleon-7b.

    Only the ``"image"`` modality is accepted.
    """
    assert modality == "image"

    chameleon_args = EngineArgs(
        model="facebook/chameleon-7b",
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    # The image tag is appended directly after the question text.
    return ModelRequestData(
        engine_args=chameleon_args,
        prompts=[f"{q}<image>" for q in questions],
    )
121
122


123
# Deepseek-VL2
124
def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for deepseek-ai/deepseek-vl2-tiny.

    Only the ``"image"`` modality is accepted.
    """
    assert modality == "image"

    engine_kwargs = dict(
        model="deepseek-ai/deepseek-vl2-tiny",
        max_model_len=4096,
        max_num_seqs=2,
        # Architecture name is overridden explicitly for this checkpoint.
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
        limit_mm_per_prompt={modality: 1},
    )

    def _as_prompt(question: str) -> str:
        return f"<|User|>: <image>\n{question}\n\n<|Assistant|>:"

    return ModelRequestData(
        engine_args=EngineArgs(**engine_kwargs),
        prompts=[_as_prompt(q) for q in questions],
    )
146
147


148
# Florence2
149
def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for microsoft/Florence-2-large.

    Only the ``"image"`` modality is accepted. The question text is not
    used: every prompt is the fixed ``<MORE_DETAILED_CAPTION>`` task tag.
    """
    assert modality == "image"

    florence_args = EngineArgs(
        model="microsoft/Florence-2-large",
        tokenizer="Isotr0py/Florence-2-tokenizer",
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={modality: 1},
    )

    # One identical task prompt per incoming question.
    task_prompts = ["<MORE_DETAILED_CAPTION>"] * len(questions)

    return ModelRequestData(engine_args=florence_args, prompts=task_prompts)
168
169


170
# Fuyu
171
def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for adept/fuyu-8b.

    Only the ``"image"`` modality is accepted.
    """
    assert modality == "image"

    fuyu_args = EngineArgs(
        model="adept/fuyu-8b",
        max_model_len=2048,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    # Each prompt is just the question terminated by a newline.
    return ModelRequestData(
        engine_args=fuyu_args,
        prompts=[f"{q}\n" for q in questions],
    )
186
187


188
# Gemma 3
189
def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for google/gemma-3-4b-it.

    Only the ``"image"`` modality is accepted.
    """
    assert modality == "image"

    def _as_prompt(question: str) -> str:
        # User turn holding the image tag and question, then the model turn.
        return ("<bos><start_of_turn>user\n"
                f"<start_of_image>{question}<end_of_turn>\n"
                "<start_of_turn>model\n")

    gemma_args = EngineArgs(
        model="google/gemma-3-4b-it",
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"do_pan_and_scan": True},
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=gemma_args,
        prompts=[_as_prompt(q) for q in questions],
    )
209
210


211
# GLM-4v
212
def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments, prompts and stop tokens for THUDM/glm-4v-9b.

    Args:
        questions: Questions to ask about the image; one prompt is built
            per question.
        modality: Must be ``"image"``.

    Returns:
        Request data carrying the engine arguments, the formatted prompts
        and the stop token IDs for this model.
    """
    assert modality == "image"
    model_name = "THUDM/glm-4v-9b"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        trust_remote_code=True,
        enforce_eager=True,
        hf_overrides={"architectures": ["GLM4VForCausalLM"]},
        limit_mm_per_prompt={modality: 1},
    )

    # Adjacent string literals are used rather than a backslash continuation
    # inside a single literal: the continuation form silently embedded the
    # next line's source indentation between the image tags and the question.
    prompts = [("<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>"
                f"{question}<|assistant|>") for question in questions]

    stop_token_ids = [151329, 151336, 151338]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
238
239
240


# H2OVL-Mississippi
241
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments, prompts and stop tokens for H2OVL-Mississippi.

    Only the ``"image"`` modality is accepted. Prompts are rendered with the
    chat template of the model's own tokenizer.
    """
    assert modality == "image"

    checkpoint = "h2oai/h2ovl-mississippi-800m"

    h2ovl_args = EngineArgs(
        model=checkpoint,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={modality: 1},
    )

    hf_tokenizer = AutoTokenizer.from_pretrained(checkpoint,
                                                 trust_remote_code=True)

    # One single-turn conversation per question, image tag first.
    conversations = []
    for question in questions:
        user_turn = {"role": "user", "content": f"<image>\n{question}"}
        conversations.append([user_turn])

    rendered = hf_tokenizer.apply_chat_template(conversations,
                                                tokenize=False,
                                                add_generation_prompt=True)

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    return ModelRequestData(
        engine_args=h2ovl_args,
        prompts=rendered,
        stop_token_ids=[hf_tokenizer.eos_token_id],
    )
272
273
274


# Idefics3-8B-Llama3
275
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for HuggingFaceM4/Idefics3-8B-Llama3.

    Only the ``"image"`` modality is accepted.
    """
    assert modality == "image"

    # if you are running out of memory, you can reduce the "longest_edge".
    # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
    processor_kwargs = {"size": {"longest_edge": 3 * 364}}

    idefics_args = EngineArgs(
        model="HuggingFaceM4/Idefics3-8B-Llama3",
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        mm_processor_kwargs=processor_kwargs,
        limit_mm_per_prompt={modality: 1},
    )

    def _as_prompt(question: str) -> str:
        return (f"<|begin_of_text|>User:<image>{question}"
                "<end_of_utterance>\nAssistant:")

    return ModelRequestData(
        engine_args=idefics_args,
        prompts=[_as_prompt(q) for q in questions],
    )
301
302


303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
# SmolVLM2-2.2B-Instruct
def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for SmolVLM2-2.2B-Instruct.

    Only the ``"image"`` modality is accepted.
    """
    assert modality == "image"

    processor_kwargs = {"max_image_size": {"longest_edge": 384}}

    smolvlm_args = EngineArgs(
        model="HuggingFaceTB/SmolVLM2-2.2B-Instruct",
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        mm_processor_kwargs=processor_kwargs,
        limit_mm_per_prompt={modality: 1},
    )

    def _as_prompt(question: str) -> str:
        return (f"<|im_start|>User:<image>{question}"
                "<end_of_utterance>\nAssistant:")

    return ModelRequestData(
        engine_args=smolvlm_args,
        prompts=[_as_prompt(q) for q in questions],
    )


331
# InternVL
332
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments, prompts and stop tokens for InternVL2-2B.

    Only the ``"image"`` modality is accepted. Prompts are rendered with the
    chat template of the model's own tokenizer.
    """
    assert modality == "image"

    checkpoint = "OpenGVLab/InternVL2-2B"

    internvl_args = EngineArgs(
        model=checkpoint,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={modality: 1},
    )

    hf_tokenizer = AutoTokenizer.from_pretrained(checkpoint,
                                                 trust_remote_code=True)

    # One single-turn conversation per question, image tag first.
    conversations = []
    for question in questions:
        user_turn = {"role": "user", "content": f"<image>\n{question}"}
        conversations.append([user_turn])

    rendered = hf_tokenizer.apply_chat_template(conversations,
                                                tokenize=False,
                                                add_generation_prompt=True)

    # Stop tokens for InternVL
    # models variants may have different stop tokens
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_words = ("<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>")
    stop_ids = list(map(hf_tokenizer.convert_tokens_to_ids, stop_words))

    return ModelRequestData(
        engine_args=internvl_args,
        prompts=rendered,
        stop_token_ids=stop_ids,
    )
366
367


368
369
370
371
372
373
374
375
376
377
378
379
380
# Kimi-VL
def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for moonshotai/Kimi-VL-A3B-Instruct.

    Only the ``"image"`` modality is accepted.
    """
    assert modality == "image"

    def _as_prompt(question: str) -> str:
        # User turn with one image media block, then the assistant opener.
        return (
            "<|im_user|>user<|im_middle|><|media_start|>image<|media_content|>"
            f"<|media_pad|><|media_end|>{question}<|im_end|>"
            "<|im_assistant|>assistant<|im_middle|>")

    formatted = [_as_prompt(q) for q in questions]

    kimi_args = EngineArgs(
        model="moonshotai/Kimi-VL-A3B-Instruct",
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(engine_args=kimi_args, prompts=formatted)


391
# LLaVA-1.5
392
def run_llava(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for llava-hf/llava-1.5-7b-hf.

    Only the ``"image"`` modality is accepted.
    """
    assert modality == "image"

    llava_args = EngineArgs(
        model="llava-hf/llava-1.5-7b-hf",
        max_model_len=4096,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=llava_args,
        prompts=[f"USER: <image>\n{q}\nASSISTANT:" for q in questions],
    )
409
410
411


# LLaVA-1.6/LLaVA-NeXT
412
def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for llava-v1.6-mistral-7b-hf.

    Only the ``"image"`` modality is accepted.
    """
    assert modality == "image"

    llava_next_args = EngineArgs(
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        max_model_len=8192,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=llava_next_args,
        prompts=[f"[INST] <image>\n{q} [/INST]" for q in questions],
    )
426
427
428
429


# LLaVA-NeXT-Video
# Currently only supports video input
430
431
def run_llava_next_video(questions: list[str],
                         modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for LLaVA-NeXT-Video-7B-hf.

    Only the ``"video"`` modality is accepted.
    """
    assert modality == "video"

    video_args = EngineArgs(
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=video_args,
        prompts=[f"USER: <video>\n{q} ASSISTANT:" for q in questions],
    )
448
449


450
# LLaVA-OneVision
451
452
def run_llava_onevision(questions: list[str],
                        modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for llava-onevision-qwen2-7b-ov-hf.

    Args:
        questions: Questions to ask about the input; one prompt is built
            per question.
        modality: Either ``"image"`` or ``"video"``; it selects the
            ``<image>`` or ``<video>`` tag placed in the prompt.

    Returns:
        Request data carrying the engine arguments and formatted prompts.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    # Fail with a clear message instead of an unbound-variable error later.
    if modality not in ("image", "video"):
        raise ValueError(
            f"Unsupported modality for LLaVA-OneVision: {modality!r}")

    # The tag name equals the modality name. Adjacent literals are used
    # rather than a backslash continuation inside one literal, which leaked
    # the source indentation into the prompt between the two turns.
    prompts = [
        f"<|im_start|>user <{modality}>\n{question}<|im_end|>"
        "<|im_start|>assistant\n" for question in questions
    ]

    engine_args = EngineArgs(
        model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
        max_model_len=16384,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
476
477


478
# Mantis
479
def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments, prompts and stop tokens for Mantis-8B.

    Only the ``"image"`` modality is accepted.
    """
    assert modality == "image"

    # Llama-3 style single-turn template; "{}" receives the user content.
    chat_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'  # noqa: E501

    formatted = []
    for question in questions:
        user_content = f"{question}\n<image>"
        formatted.append(chat_template.format(user_content))

    mantis_args = EngineArgs(
        model="TIGER-Lab/Mantis-8B-siglip-llama3",
        max_model_len=4096,
        hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=mantis_args,
        prompts=formatted,
        stop_token_ids=[128009],
    )
501
502
503


# MiniCPM-V
504
def run_minicpmv_base(questions: list[str], modality: str,
                      model_name: str) -> ModelRequestData:
    """Build request data shared by the MiniCPM-V and MiniCPM-o examples.

    Checkpoints and the modalities they support:

    * 2.0  (image): ``"HwwwH/MiniCPM-V-2"``. The official repo doesn't work
      yet, so a fork is needed for now. For more details, see
      https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630
    * 2.5  (image): ``"openbmb/MiniCPM-Llama3-V-2_5"``
    * 2.6  (image, video): ``"openbmb/MiniCPM-V-2_6"``
    * o2.6 (image, video, audio): ``"openbmb/MiniCPM-o-2_6"``

    If you want to use ``MiniCPM-o-2_6`` with audio inputs, check
    ``audio_language.py``.

    Args:
        questions: Questions to ask about the input; one prompt each.
        modality: ``"image"`` or ``"video"``.
        model_name: Checkpoint used for both the tokenizer and the engine.

    Returns:
        Request data carrying the engine arguments, the chat-templated
        prompts and the stop token IDs.
    """
    assert modality in ["image", "video"]

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        limit_mm_per_prompt={modality: 1},
    )

    # NOTE The stop_token_ids are different for various versions of MiniCPM-V
    # 2.0: [tokenizer.eos_id]
    # 2.5: [tokenizer.eos_id, tokenizer.eot_id]
    # 2.6 / o2.6: the two tokens below (this is what the function uses).
    stop_tokens = ['<|im_end|>', '<|endoftext|>']
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    modality_placeholder = {
        "image": "(<image>./</image>)",
        "video": "(<video>./</video>)",
    }

    # Each question becomes a single-turn conversation whose content starts
    # with the placeholder for the requested modality.
    prompts = [
        tokenizer.apply_chat_template(
            [{
                'role': 'user',
                'content': f"{modality_placeholder[modality]}\n{question}"
            }],
            tokenize=False,
            add_generation_prompt=True) for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
566
567


568
def run_minicpmo(questions: list[str], modality: str) -> ModelRequestData:
    """Delegate to ``run_minicpmv_base`` with the MiniCPM-o-2_6 checkpoint."""
    return run_minicpmv_base(questions,
                             modality,
                             model_name="openbmb/MiniCPM-o-2_6")
570
571


572
def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData:
    """Delegate to ``run_minicpmv_base`` with the MiniCPM-V-2_6 checkpoint."""
    return run_minicpmv_base(questions,
                             modality,
                             model_name="openbmb/MiniCPM-V-2_6")
574
575


576
577
578
579
580
581
582
583
584
585
586
587
# Mistral-3 HF-format
def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Mistral-Small-3.1 (HF format).

    Only the ``"image"`` modality is accepted.
    """
    assert modality == "image"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    mistral_args = EngineArgs(
        model="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=mistral_args,
        prompts=[f"<s>[INST]{q}\n[IMG][/INST]" for q in questions],
    )


599
# Llama 3.2
600
def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Llama-3.2-11B-Vision-Instruct.

    Only the ``"image"`` modality is accepted. Prompts are rendered with the
    chat template of the model's own tokenizer.
    """
    assert modality == "image"

    checkpoint = "meta-llama/Llama-3.2-11B-Vision-Instruct"

    # Note: The default setting of max_num_seqs (256) and
    # max_model_len (131072) for this model may cause OOM.
    # You may lower either to run this example on lower-end GPUs.

    # The configuration below has been confirmed to launch on a single L40 GPU.
    mllama_args = EngineArgs(
        model=checkpoint,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    hf_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # One single-turn conversation per question: an image part followed by
    # the question as a text part.
    conversations = []
    for question in questions:
        parts = [{"type": "image"}, {"type": "text", "text": question}]
        conversations.append([{"role": "user", "content": parts}])

    rendered = hf_tokenizer.apply_chat_template(conversations,
                                                add_generation_prompt=True,
                                                tokenize=False)

    return ModelRequestData(engine_args=mllama_args, prompts=rendered)
636
637


638
def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Llama-4-Scout-17B-16E-Instruct.

    Only the ``"image"`` modality is accepted. Prompts are rendered with the
    chat template of the model's own tokenizer; no stop tokens are set.
    """
    assert modality == "image"

    checkpoint = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

    llama4_args = EngineArgs(
        model=checkpoint,
        max_model_len=8192,
        max_num_seqs=4,
        tensor_parallel_size=8,
        gpu_memory_utilization=0.4,
        limit_mm_per_prompt={modality: 1},
    )

    hf_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # One single-turn conversation per question: an image part followed by
    # the question as a text part.
    conversations = []
    for question in questions:
        parts = [{"type": "image"}, {"type": "text", "text": f"{question}"}]
        conversations.append([{"role": "user", "content": parts}])

    rendered = hf_tokenizer.apply_chat_template(conversations,
                                                add_generation_prompt=True,
                                                tokenize=False)

    return ModelRequestData(
        engine_args=llama4_args,
        prompts=rendered,
        stop_token_ids=None,
    )


674
# Molmo
675
def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for allenai/Molmo-7B-D-0924.

    Args:
        questions: Questions to ask about the image; one prompt is built
            per question.
        modality: Must be ``"image"``.

    Returns:
        Request data carrying the engine arguments and formatted prompts.
    """
    assert modality == "image"

    model_name = "allenai/Molmo-7B-D-0924"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={modality: 1},
    )

    # Adjacent string literals are used rather than a backslash continuation
    # inside one literal, which leaked the source indentation into the
    # prompt between the user turn and the assistant turn.
    prompts = [
        f"<|im_start|>user <image>\n{question}<|im_end|>"
        "<|im_start|>assistant\n" for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
696
697


698
# NVLM-D
699
def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for nvidia/NVLM-D-72B.

    Only the ``"image"`` modality is accepted. Prompts are rendered with the
    chat template of the model's own tokenizer.
    """
    assert modality == "image"

    checkpoint = "nvidia/NVLM-D-72B"

    # Adjust this as necessary to fit in GPU
    nvlm_args = EngineArgs(
        model=checkpoint,
        trust_remote_code=True,
        max_model_len=4096,
        tensor_parallel_size=4,
        limit_mm_per_prompt={modality: 1},
    )

    hf_tokenizer = AutoTokenizer.from_pretrained(checkpoint,
                                                 trust_remote_code=True)

    # One single-turn conversation per question, image tag first.
    conversations = []
    for question in questions:
        user_turn = {"role": "user", "content": f"<image>\n{question}"}
        conversations.append([user_turn])

    rendered = hf_tokenizer.apply_chat_template(conversations,
                                                tokenize=False,
                                                add_generation_prompt=True)

    return ModelRequestData(engine_args=nvlm_args, prompts=rendered)
727
728


729
730
# Ovis
def run_ovis(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for AIDC-AI/Ovis2-1B.

    Only the ``"image"`` modality is accepted. Prompts are rendered with the
    chat template of the model's own tokenizer.
    """
    assert modality == "image"

    checkpoint = "AIDC-AI/Ovis2-1B"

    ovis_args = EngineArgs(
        model=checkpoint,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        limit_mm_per_prompt={modality: 1},
    )

    hf_tokenizer = AutoTokenizer.from_pretrained(checkpoint,
                                                 trust_remote_code=True)

    # One single-turn conversation per question, image tag first.
    conversations = []
    for question in questions:
        user_turn = {"role": "user", "content": f"<image>\n{question}"}
        conversations.append([user_turn])

    rendered = hf_tokenizer.apply_chat_template(conversations,
                                                tokenize=False,
                                                add_generation_prompt=True)

    return ModelRequestData(engine_args=ovis_args, prompts=rendered)


760
# PaliGemma
761
def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for google/paligemma-3b-mix-224.

    Only the ``"image"`` modality is accepted. The question text is not
    used: every prompt is the fixed string ``"caption en"``.
    """
    assert modality == "image"

    # PaliGemma has special prompt format for VQA
    caption_prompts = ["caption en"] * len(questions)

    paligemma_args = EngineArgs(
        model="google/paligemma-3b-mix-224",
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(engine_args=paligemma_args,
                            prompts=caption_prompts)
775
776


777
# PaliGemma 2
778
def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for paligemma2-3b-ft-docci-448.

    Only the ``"image"`` modality is accepted. The question text is not
    used: every prompt is the fixed string ``"caption en"``.
    """
    assert modality == "image"

    # PaliGemma 2 has special prompt format for VQA
    caption_prompts = ["caption en"] * len(questions)

    paligemma2_args = EngineArgs(
        model="google/paligemma2-3b-ft-docci-448",
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(engine_args=paligemma2_args,
                            prompts=caption_prompts)
792
793


794
# Phi-3-Vision
795
def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Phi-3.5-vision-instruct.

    Only the ``"image"`` modality is accepted.
    """
    assert modality == "image"

    def _as_prompt(question: str) -> str:
        return f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n"

    formatted = [_as_prompt(q) for q in questions]

    # num_crops is an override kwarg to the multimodal image processor;
    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
    # to use 16 for single frame scenarios, and 4 for multi-frame.
    #
    # Generally speaking, a larger value for num_crops results in more
    # tokens per image instance, because it may scale the image more in
    # the image preprocessing. Some references in the model docs and the
    # formula for image tokens after the preprocessing
    # transform can be found below.
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    processor_kwargs = {"num_crops": 16}

    phi3v_args = EngineArgs(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs=processor_kwargs,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(engine_args=phi3v_args, prompts=formatted)
829
830


831
# Phi-4-multimodal-instruct
832
def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
    """
    Prepare engine arguments, prompts and the vision LoRA request for
    Phi-4-multimodal-instruct. The model supports both image and audio
    inputs; this helper shows the image path only.
    """
    assert modality == "image"

    snapshot_dir = snapshot_download("microsoft/Phi-4-multimodal-instruct")
    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    lora_dir = os.path.join(snapshot_dir, "vision-lora")

    formatted = [
        f"<|user|><|image_1|>{q}<|end|><|assistant|>" for q in questions
    ]

    phi4mm_args = EngineArgs(
        model=snapshot_dir,
        trust_remote_code=True,
        max_model_len=5120,
        max_num_seqs=2,
        max_num_batched_tokens=12800,
        enable_lora=True,
        max_lora_rank=320,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"dynamic_hd": 16},
        limit_mm_per_prompt={modality: 1},
    )

    vision_lora = LoRARequest("vision", 1, lora_dir)

    return ModelRequestData(
        engine_args=phi4mm_args,
        prompts=formatted,
        lora_requests=[vision_lora],
    )
864
865


866
# Pixtral HF-format
867
def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for mistral-community/pixtral-12b.

    Only the ``"image"`` modality is accepted.
    """
    assert modality == "image"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    pixtral_args = EngineArgs(
        model="mistral-community/pixtral-12b",
        max_model_len=6144,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=pixtral_args,
        prompts=[f"<s>[INST]{q}\n[IMG][/INST]" for q in questions],
    )
886
887


888
# Qwen
889
def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for Qwen/Qwen-VL.

    Only the ``"image"`` modality is accepted.
    """
    assert modality == "image"

    engine_kwargs = dict(
        model="Qwen/Qwen-VL",
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
        # Architecture name is overridden explicitly for this checkpoint.
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=EngineArgs(**engine_kwargs),
        prompts=[f"{q}Picture 1: <img></img>\n" for q in questions],
    )
907
908


909
# Qwen2-VL
910
def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for Qwen/Qwen2-VL-7B-Instruct.

    Args:
        questions: Questions to ask about the input; one prompt is built
            per question.
        modality: Either ``"image"`` or ``"video"``; it selects the pad
            token placed between the vision start/end markers.

    Returns:
        Request data carrying the engine arguments and formatted prompts.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    if modality == "image":
        placeholder = "<|image_pad|>"
    elif modality == "video":
        placeholder = "<|video_pad|>"
    else:
        # Previously this fell through and failed later with an
        # unbound-variable error when the prompts were built.
        raise ValueError(f"Unsupported modality for Qwen2-VL: {modality!r}")

    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
        },
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
         f"{question}<|im_end|>\n"
         "<|im_start|>assistant\n") for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
942
943


Roger Wang's avatar
Roger Wang committed
944
# Qwen2.5-VL
945
def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for Qwen/Qwen2.5-VL-3B-Instruct.

    Args:
        questions: Questions to ask about the input; one prompt is built
            per question.
        modality: Either ``"image"`` or ``"video"``; it selects the pad
            token placed between the vision start/end markers.

    Returns:
        Request data carrying the engine arguments and formatted prompts.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    if modality == "image":
        placeholder = "<|image_pad|>"
    elif modality == "video":
        placeholder = "<|video_pad|>"
    else:
        # Previously this fell through and failed later with an
        # unbound-variable error when the prompts were built.
        raise ValueError(f"Unsupported modality for Qwen2.5-VL: {modality!r}")

    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
         f"{question}<|im_end|>\n"
         "<|im_start|>assistant\n") for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
Roger Wang's avatar
Roger Wang committed
977
978


979
980
981
982
983
984
985
986
987
988
989
990
991
# Qwen2.5-Omni
def run_qwen2_5_omni(questions: list[str],
                     modality: str) -> ModelRequestData:
    """Build the engine arguments and prompts for Qwen2.5-Omni.

    Args:
        questions: Questions to ask; one prompt is produced per question.
        modality: Either "image" or "video"; selects the vision placeholder
            token embedded in every prompt.

    Returns:
        A `ModelRequestData` carrying the engine arguments and the prompts.

    Raises:
        ValueError: If `modality` is neither "image" nor "video".
    """
    # Reject unsupported modalities before anything else is built; without
    # this branch `placeholder` would be left unbound and the failure would
    # only show up while the prompts are being formatted.
    if modality == "image":
        placeholder = "<|IMAGE|>"
    elif modality == "video":
        placeholder = "<|VIDEO|>"
    else:
        msg = f"Modality {modality} is not supported."
        raise ValueError(msg)

    model_name = "Qwen/Qwen2.5-Omni-7B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": [1],
        },
        limit_mm_per_prompt={modality: 1},
    )

    default_system = (
        "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
        "Group, capable of perceiving auditory and visual inputs, as well as "
        "generating text and speech.")

    # The placeholder sits between the vision bos/eos markers, directly in
    # front of the question text.
    prompts = [(f"<|im_start|>system\n{default_system}<|im_end|>\n"
                f"<|im_start|>user\n<|vision_bos|>{placeholder}<|vision_eos|>"
                f"{question}<|im_end|>\n"
                "<|im_start|>assistant\n") for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
# SkyworkR1V
def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments, prompts and stop tokens for Skywork-R1V.

    Only the "image" modality is accepted. Prompts are rendered through the
    chat template of the model's own tokenizer, and the stop-token ids are
    looked up with that same tokenizer.

    Args:
        questions: Questions to ask; one single-turn conversation is built
            per question.
        modality: Must be "image".

    Returns:
        A `ModelRequestData` with engine arguments, prompts and stop-token
        ids.
    """
    assert modality == "image"

    model_name = "Skywork/Skywork-R1V-38B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={modality: 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)

    # One single-turn user conversation per question, with the image tag
    # placed on its own line ahead of the question.
    conversations = []
    for question in questions:
        user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
        conversations.append([user_turn])

    prompts = tokenizer.apply_chat_template(conversations,
                                            tokenize=False,
                                            add_generation_prompt=True)

    # Stop tokens for SkyworkR1V
    # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py
    stop_token_ids = [
        tokenizer.convert_tokens_to_ids(token)
        for token in ("<|end▁of▁sentence|>", "<|endoftext|>")
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


1050
# Maps every accepted `--model-type` value to the function that builds the
# request data (engine arguments, prompts, ...) for that model.
# Insertion order is the order in which the choices are listed by the parser.
model_example_map = {
    "aria": run_aria,
    "aya_vision": run_aya_vision,
    "blip-2": run_blip2,
    "chameleon": run_chameleon,
    "deepseek_vl_v2": run_deepseek_vl2,
    "florence2": run_florence2,
    "fuyu": run_fuyu,
    "gemma3": run_gemma3,
    "glm4v": run_glm4v,
    "h2ovl_chat": run_h2ovl,
    "idefics3": run_idefics3,
    "internvl_chat": run_internvl,
    "kimi_vl": run_kimi_vl,
    "llava": run_llava,
    "llava-next": run_llava_next,
    "llava-next-video": run_llava_next_video,
    "llava-onevision": run_llava_onevision,
    "mantis": run_mantis,
    "minicpmo": run_minicpmo,
    "minicpmv": run_minicpmv,
    "mistral3": run_mistral3,
    "mllama": run_mllama,
    "llama4": run_llama4,
    "molmo": run_molmo,
    "NVLM_D": run_nvlm_d,
    "ovis": run_ovis,
    "paligemma": run_paligemma,
    "paligemma2": run_paligemma2,
    "phi3_v": run_phi3v,
    "phi4_mm": run_phi4mm,
    "pixtral_hf": run_pixtral_hf,
    "qwen_vl": run_qwen_vl,
    "qwen2_vl": run_qwen2_vl,
    "qwen2_5_vl": run_qwen2_5_vl,
    "qwen2_5_omni": run_qwen2_5_omni,
    "skywork_chat": run_skyworkr1v,
    "smolvlm": run_smolvlm,
}


1091
1092
1093
1094
1095
1096
1097
1098
1099
def get_multi_modal_input(args):
    """Load the bundled sample asset and questions for `args.modality`.

    Args:
        args: Parsed command-line arguments. `modality` is always read;
            `num_frames` is read only for the "video" modality.

    Returns:
        {
            "data": image or video,
            "questions": list of questions about that data,
        }

    Raises:
        ValueError: If `args.modality` is neither "image" nor "video".
    """
    if args.modality == "image":
        # Input image and question
        rgb_image = convert_image_mode(
            ImageAsset("cherry_blossom").pil_image, "RGB")
        return {
            "data": rgb_image,
            "questions": [
                "What is the content of this image?",
                "Describe the content of this image in detail.",
                "What's in the image?",
                "Where is this image taken?",
            ],
        }

    if args.modality == "video":
        # Input video and question
        frames = VideoAsset(name="baby_reading",
                            num_frames=args.num_frames).np_ndarrays
        return {
            "data": frames,
            "questions": ["Why is this video funny?"],
        }

    raise ValueError(f"Modality {args.modality} is not supported.")


1129
1130
def apply_image_repeat(image_repeat_prob, num_prompts, data,
                       prompts: list[str], modality):
    """Repeats images with provided probability of "image_repeat_prob".
    Used to simulate hit/miss for the MM preprocessor cache.

    Args:
        image_repeat_prob: Probability in [0, 1] that a request reuses the
            image of the previous request unchanged.
        num_prompts: Number of request dicts to generate.
        data: The starting image. It must provide `copy()` and
            `putpixel()` (e.g. a PIL image); the object passed in is never
            modified because a copy is taken before any pixel is written.
        prompts: Prompt strings, cycled through in order.
        modality: Key under which the image is stored in
            "multi_modal_data".

    Returns:
        A list of `num_prompts` dicts, each with a "prompt" and a
        "multi_modal_data" entry.
    """
    assert 0 <= image_repeat_prob <= 1.0
    no_yes = [0, 1]
    probs = [1.0 - image_repeat_prob, image_repeat_prob]

    inputs = []
    cur_image = data
    for i in range(num_prompts):
        if random.choices(no_yes, probs)[0] == 0:
            # No repeat => Modify one pixel
            cur_image = cur_image.copy()
            # Spread `i` over the three channels as base-256 digits. Every
            # digit is reduced modulo 256 so it stays a valid 8-bit channel
            # value even once `i` reaches 65536 and beyond.
            new_val = ((i >> 16) % 256, (i >> 8) % 256, i % 256)
            cur_image.putpixel((0, 0), new_val)

        inputs.append({
            "prompt": prompts[i % len(prompts)],
            "multi_modal_data": {
                modality: cur_image
            }
        })

    return inputs


1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
@contextmanager
def time_counter(enable: bool):
    """Context manager that optionally reports how long its body took.

    Args:
        enable: When true, the elapsed time of the `with` body is printed
            between two separator lines once the body finishes normally.
            When false, the body simply runs and nothing is printed.
    """
    if not enable:
        yield
        return

    import time

    # perf_counter() is monotonic and high-resolution, so the measurement is
    # not affected by system clock adjustments the way time.time() is.
    start_time = time.perf_counter()
    yield
    elapsed_time = time.perf_counter() - start_time
    print("-" * 50)
    print(f"-- generate time = {elapsed_time}")
    print("-" * 50)


1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
def parse_args():
    """Define the command-line options of this example and parse them.

    Returns:
        The namespace returned by the parser's `parse_args()`.
    """
    parser = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "vision language models for text generation")

    # What to run: model, number of requests, input modality.
    parser.add_argument("--model-type",
                        "-m",
                        type=str,
                        default="llava",
                        choices=model_example_map.keys(),
                        help='Huggingface "model_type".')
    parser.add_argument("--num-prompts",
                        type=int,
                        default=4,
                        help="Number of prompts to run.")
    parser.add_argument("--modality",
                        type=str,
                        default="image",
                        choices=["image", "video"],
                        help="Modality of the input.")
    parser.add_argument("--num-frames",
                        type=int,
                        default=16,
                        help="Number of frames to extract from the video.")
    parser.add_argument("--seed",
                        type=int,
                        default=None,
                        help="Set the seed when initializing `vllm.LLM`.")

    # Multi-modal preprocessor cache experiments.
    parser.add_argument(
        "--image-repeat-prob",
        type=float,
        default=None,
        help="Simulates the hit-ratio for multi-modal preprocessor cache"
        " (if enabled)")
    parser.add_argument(
        "--disable-mm-preprocessor-cache",
        action="store_true",
        help="If True, disables caching of multi-modal preprocessor/mapper.")

    # Reporting and prompt selection.
    parser.add_argument(
        "--time-generate",
        action="store_true",
        help="If True, then print the total generate() call time")
    parser.add_argument(
        "--use-different-prompt-per-request",
        action="store_true",
        help="If True, then use different prompt (with the same multi-modal "
        "data) for each request.")

    return parser.parse_args()


1226
1227
1228
1229
1230
def main(args):
    """Run the selected vision-language example end to end.

    Loads the sample input for `args.modality`, asks the model-specific
    builder for engine arguments and prompts, creates the `LLM`, generates
    for `args.num_prompts` requests and prints every generated text.

    Args:
        args: The namespace produced by `parse_args()`.

    Raises:
        ValueError: If `args.model_type` has no entry in
            `model_example_map`.
    """
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    modality = args.modality
    mm_input = get_multi_modal_input(args)
    data = mm_input["data"]
    questions = mm_input["questions"]

    req_data = model_example_map[model](questions, modality)

    # Disable other modalities to save memory
    mm_limits = {"image": 0, "video": 0, "audio": 0}
    mm_limits.update(req_data.engine_args.limit_mm_per_prompt or {})
    req_data.engine_args.limit_mm_per_prompt = mm_limits

    # Command-line values override whatever the builder put in these fields.
    llm_kwargs = asdict(req_data.engine_args)
    llm_kwargs["seed"] = args.seed
    llm_kwargs["disable_mm_preprocessor_cache"] = (
        args.disable_mm_preprocessor_cache)
    llm = LLM(**llm_kwargs)

    # Don't want to check the flag multiple times, so just hijack `prompts`.
    if args.use_different_prompt_per_request:
        prompts = req_data.prompts
    else:
        prompts = [req_data.prompts[0]]

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=req_data.stop_token_ids)

    assert args.num_prompts > 0
    if args.num_prompts == 1:
        # Single inference
        inputs = {
            "prompt": prompts[0],
            "multi_modal_data": {
                modality: data
            },
        }
    elif args.image_repeat_prob is not None:
        # Batch inference:
        # repeat images with specified probability of "image_repeat_prob"
        inputs = apply_image_repeat(args.image_repeat_prob, args.num_prompts,
                                    data, prompts, modality)
    else:
        # Batch inference: use the same image for all prompts
        inputs = []
        for idx in range(args.num_prompts):
            inputs.append({
                "prompt": prompts[idx % len(prompts)],
                "multi_modal_data": {
                    modality: data
                },
            })

    # Add LoRA request if applicable
    lora_request = None
    if req_data.lora_requests:
        lora_request = req_data.lora_requests * args.num_prompts

    with time_counter(args.time_generate):
        outputs = llm.generate(
            inputs,
            sampling_params=sampling_params,
            lora_request=lora_request,
        )

    print("-" * 50)
    for request_output in outputs:
        print(request_output.outputs[0].text)
        print("-" * 50)
1301
1302
1303


if __name__ == "__main__":
    # Script entry point: parse the command-line options and run the example.
    main(parse_args())