# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for text generation.

For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
9
import os
10
import random
11
from contextlib import contextmanager
12
13
from dataclasses import asdict
from typing import NamedTuple, Optional
14

15
from huggingface_hub import snapshot_download
16
17
from transformers import AutoTokenizer

18
from vllm import LLM, EngineArgs, SamplingParams
19
from vllm.assets.image import ImageAsset
20
from vllm.assets.video import VideoAsset
21
from vllm.lora.request import LoRARequest
22
23
from vllm.utils import FlexibleArgumentParser

24
25
26
27
28
29
30
31

class ModelRequestData(NamedTuple):
    """Values returned by each `run_*` helper in this file for one model."""
    # Engine arguments chosen by the helper for its model.
    engine_args: EngineArgs
    # Formatted prompt strings, built from the input questions.
    prompts: list[str]
    # Stop token ids for generation; None when the helper sets none.
    stop_token_ids: Optional[list[int]] = None
    # LoRA requests; None unless the helper supplies adapters.
    lora_requests: Optional[list[LoRARequest]] = None


32
33
34
35
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.

36

37
# Aria
38
def run_aria(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `rhymes-ai/Aria` (image input only).

    Every question is wrapped in the model's chat-style prompt, and a fixed
    list of stop token ids is returned with the engine arguments.
    """
    assert modality == "image"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    aria_args = EngineArgs(
        model="rhymes-ai/Aria",
        max_model_len=4096,
        max_num_seqs=2,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": 1},
    )

    # The question is inserted between these two fixed pieces.
    before_question = "<|im_start|>user\n<fim_prefix><|img|><fim_suffix>"
    after_question = "<|im_end|>\n<|im_start|>assistant\n"

    return ModelRequestData(
        engine_args=aria_args,
        prompts=[f"{before_question}{q}{after_question}" for q in questions],
        stop_token_ids=[93532, 93653, 944, 93421, 1019, 93653, 93519],
    )
62
63


Jennifer Zhao's avatar
Jennifer Zhao committed
64
65
66
67
68
69
70
71
72
73
# Aya Vision
def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `CohereForAI/aya-vision-8b` (image only).

    Each question is placed in a single user turn after the `<image>`
    placeholder, followed by the opening of the chatbot turn.
    """
    assert modality == "image"

    user_turn_open = "<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>"
    chatbot_turn_open = ("<|END_OF_TURN_TOKEN|>"
                         "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>")
    formatted = [
        f"{user_turn_open}{q}{chatbot_turn_open}" for q in questions
    ]

    return ModelRequestData(
        engine_args=EngineArgs(
            model="CohereForAI/aya-vision-8b",
            max_model_len=2048,
            max_num_seqs=2,
            mm_processor_kwargs={"crop_to_patches": True},
            limit_mm_per_prompt={"image": 1},
        ),
        prompts=formatted,
    )


86
# BLIP-2
87
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `Salesforce/blip2-opt-6.7b` (image only)."""
    assert modality == "image"

    blip2_args = EngineArgs(
        model="Salesforce/blip2-opt-6.7b",
        limit_mm_per_prompt={"image": 1},
    )

    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
    qa_prompts = [f"Question: {q} Answer:" for q in questions]

    return ModelRequestData(engine_args=blip2_args, prompts=qa_prompts)
102
103
104


# Chameleon
105
def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `facebook/chameleon-7b` (image only).

    The prompt is the question immediately followed by `<image>`.
    """
    assert modality == "image"

    chameleon_args = EngineArgs(
        model="facebook/chameleon-7b",
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=chameleon_args,
        prompts=[f"{q}<image>" for q in questions],
    )
120
121


122
# Deepseek-VL2
123
def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `deepseek-ai/deepseek-vl2-tiny` (image only).

    The architecture is overridden to `DeepseekVLV2ForCausalLM` through
    `hf_overrides`.
    """
    assert modality == "image"

    formatted = [
        f"<|User|>: <image>\n{q}\n\n<|Assistant|>:" for q in questions
    ]

    return ModelRequestData(
        engine_args=EngineArgs(
            model="deepseek-ai/deepseek-vl2-tiny",
            max_model_len=4096,
            max_num_seqs=2,
            hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
            limit_mm_per_prompt={"image": 1},
        ),
        prompts=formatted,
    )
145
146


147
# Florence2
148
def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `microsoft/Florence-2-large` (image only).

    The question text is not used: one `<MORE_DETAILED_CAPTION>` prompt is
    emitted per question.
    """
    assert modality == "image"

    caption_prompts = ["<MORE_DETAILED_CAPTION>" for _unused in questions]

    florence_args = EngineArgs(
        model="microsoft/Florence-2-large",
        tokenizer="facebook/bart-large",
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(engine_args=florence_args,
                            prompts=caption_prompts)
167
168


169
# Fuyu
170
def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `adept/fuyu-8b` (image only).

    The prompt is the question followed by a newline.
    """
    assert modality == "image"

    fuyu_args = EngineArgs(
        model="adept/fuyu-8b",
        max_model_len=2048,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=fuyu_args,
        prompts=[f"{q}\n" for q in questions],
    )
185
186


187
# Gemma 3
188
def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `google/gemma-3-4b-it` (image only).

    The image processor is configured with `do_pan_and_scan=True`.
    """
    assert modality == "image"

    # Fixed text before and after the question inside the user turn.
    turn_open = "<bos><start_of_turn>user\n<start_of_image>"
    turn_close = "<end_of_turn>\n<start_of_turn>model\n"
    formatted = [f"{turn_open}{q}{turn_close}" for q in questions]

    gemma_args = EngineArgs(
        model="google/gemma-3-4b-it",
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"do_pan_and_scan": True},
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(engine_args=gemma_args, prompts=formatted)
208
209


210
# GLM-4v
211
def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `THUDM/glm-4v-9b` (image only).

    Returns the engine arguments, one prompt per question and a fixed list
    of stop token ids.
    """
    assert modality == "image"

    glm_args = EngineArgs(
        model="THUDM/glm-4v-9b",
        max_model_len=2048,
        max_num_seqs=2,
        trust_remote_code=True,
        enforce_eager=True,
        hf_overrides={"architectures": ["GLM4VForCausalLM"]},
        limit_mm_per_prompt={"image": 1},
    )

    # The backslash continuation makes the second line's leading spaces part
    # of the prompt text, so these two lines are kept exactly as they were.
    prompts = [
        f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
        {question}<|assistant|>" for question in questions
    ]

    return ModelRequestData(
        engine_args=glm_args,
        prompts=prompts,
        stop_token_ids=[151329, 151336, 151338],
    )
237
238
239


# H2OVL-Mississippi
240
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `h2oai/h2ovl-mississippi-800m` (image only).

    Prompts come from the model tokenizer's chat template, and the
    tokenizer's EOS token id is returned as the only stop token.
    """
    assert modality == "image"

    repo_id = "h2oai/h2ovl-mississippi-800m"
    tok = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)

    # One single-turn conversation per question.
    conversations = []
    for q in questions:
        conversations.append([{
            'role': 'user',
            'content': f"<image>\n{q}"
        }])
    rendered = tok.apply_chat_template(conversations,
                                       tokenize=False,
                                       add_generation_prompt=True)

    h2ovl_args = EngineArgs(
        model=repo_id,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={"image": 1},
    )

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    return ModelRequestData(
        engine_args=h2ovl_args,
        prompts=rendered,
        stop_token_ids=[tok.eos_token_id],
    )
271
272
273


# Idefics3-8B-Llama3
274
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `HuggingFaceM4/Idefics3-8B-Llama3`.

    Only image input is accepted.
    """
    assert modality == "image"

    # if you are running out of memory, you can reduce the "longest_edge".
    # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
    processor_overrides = {
        "size": {
            "longest_edge": 3 * 364
        },
    }

    idefics_args = EngineArgs(
        model="HuggingFaceM4/Idefics3-8B-Llama3",
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        mm_processor_kwargs=processor_overrides,
        limit_mm_per_prompt={"image": 1},
    )

    formatted = [
        f"<|begin_of_text|>User:<image>{q}<end_of_utterance>\nAssistant:"
        for q in questions
    ]

    return ModelRequestData(engine_args=idefics_args, prompts=formatted)
300
301


302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
# SmolVLM2-2.2B-Instruct
def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `HuggingFaceTB/SmolVLM2-2.2B-Instruct`.

    Only image input is accepted; the processor's `max_image_size` is set
    to a longest edge of 384.
    """
    assert modality == "image"

    formatted = [
        f"<|im_start|>User:<image>{q}<end_of_utterance>\nAssistant:"
        for q in questions
    ]

    smolvlm_args = EngineArgs(
        model="HuggingFaceTB/SmolVLM2-2.2B-Instruct",
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        mm_processor_kwargs={
            "max_image_size": {
                "longest_edge": 384
            },
        },
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(engine_args=smolvlm_args, prompts=formatted)


330
# InternVL
331
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `OpenGVLab/InternVL2-2B` (image only).

    Prompts come from the model tokenizer's chat template; stop token ids
    are looked up from a fixed list of stop-token strings.
    """
    assert modality == "image"

    repo_id = "OpenGVLab/InternVL2-2B"
    tok = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)

    conversations = []
    for q in questions:
        conversations.append([{
            'role': 'user',
            'content': f"<image>\n{q}"
        }])
    rendered = tok.apply_chat_template(conversations,
                                       tokenize=False,
                                       add_generation_prompt=True)

    # Stop tokens for InternVL
    # models variants may have different stop tokens
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_ids = [
        tok.convert_tokens_to_ids(token)
        for token in ("<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>")
    ]

    internvl_args = EngineArgs(
        model=repo_id,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=internvl_args,
        prompts=rendered,
        stop_token_ids=stop_ids,
    )
365
366


367
# LLaVA-1.5
368
def run_llava(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `llava-hf/llava-1.5-7b-hf` (image only)."""
    assert modality == "image"

    llava_args = EngineArgs(
        model="llava-hf/llava-1.5-7b-hf",
        max_model_len=4096,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=llava_args,
        prompts=[f"USER: <image>\n{q}\nASSISTANT:" for q in questions],
    )
385
386
387


# LLaVA-1.6/LLaVA-NeXT
388
def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `llava-hf/llava-v1.6-mistral-7b-hf`.

    Only image input is accepted.
    """
    assert modality == "image"

    llava_next_args = EngineArgs(
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        max_model_len=8192,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=llava_next_args,
        prompts=[f"[INST] <image>\n{q} [/INST]" for q in questions],
    )
402
403
404
405


# LlaVA-NeXT-Video
# Currently only support for video input
406
407
def run_llava_next_video(questions: list[str],
                         modality: str) -> ModelRequestData:
    """Build the request data for `llava-hf/LLaVA-NeXT-Video-7B-hf`.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Must be "video"; this helper only supports video input.

    Returns:
        The engine arguments and the formatted prompts.
    """
    assert modality == "video"

    prompts = [
        f"USER: <video>\n{question} ASSISTANT:" for question in questions
    ]

    engine_args = EngineArgs(
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        max_model_len=8192,
        max_num_seqs=2,
        # Key the limit on the modality actually used. This helper only
        # accepts video, so an "image" limit did not describe its input.
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
424
425


426
# LLaVA-OneVision
427
428
def run_llava_onevision(questions: list[str],
                        modality: str) -> ModelRequestData:
    """Build the request data for `llava-hf/llava-onevision-qwen2-7b-ov-hf`.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Either "image" or "video"; selects the placeholder tag
            used in the prompt and the modality that is limited.

    Returns:
        The engine arguments and the formatted prompts.

    Raises:
        ValueError: If `modality` is neither "image" nor "video".
    """
    if modality not in ("image", "video"):
        # Without this check an unsupported modality skipped both prompt
        # branches and failed later with an UnboundLocalError on `prompts`.
        raise ValueError(f"Modality {modality} is not supported.")

    # The placeholder tag is simply "<image>" or "<video>".
    # NOTE(review): the backslash continuation makes the next line's leading
    # spaces part of the prompt; left unchanged to keep the exact prompt text.
    prompts = [
        f"<|im_start|>user <{modality}>\n{question}<|im_end|> \
        <|im_start|>assistant\n" for question in questions
    ]

    engine_args = EngineArgs(
        model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
        max_model_len=16384,
        # Limit the modality in use instead of always limiting "image".
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
452
453


454
# Mantis
455
def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `TIGER-Lab/Mantis-8B-siglip-llama3`.

    Only image input is accepted. Each question, followed by `<image>`, is
    inserted into a Llama-3 style chat template.
    """
    assert modality == "image"

    chat_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'  # noqa: E501
    formatted = []
    for q in questions:
        formatted.append(chat_template.format(f"{q}\n<image>"))

    mantis_args = EngineArgs(
        model="TIGER-Lab/Mantis-8B-siglip-llama3",
        max_model_len=4096,
        hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=mantis_args,
        prompts=formatted,
        stop_token_ids=[128009],
    )
477
478
479


# MiniCPM-V
480
def run_minicpmv_base(questions: list[str], modality: str,
                      model_name: str) -> ModelRequestData:
    """Build the request data shared by the MiniCPM-V / MiniCPM-o helpers.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Either "image" or "video".
        model_name: Model repository to load the tokenizer and engine from.

    Returns:
        The engine arguments, the chat-template prompts and stop token ids.
    """
    assert modality in ["image", "video"]
    # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa

    # Model names previously listed here for reference:
    #   2.0:  "HwwwH/MiniCPM-V-2"
    #         The official repo doesn't work yet, so we need to use a fork
    #         for now. For more details, please see:
    #         https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa
    #   2.5:  "openbmb/MiniCPM-Llama3-V-2_5"
    #   2.6:  "openbmb/MiniCPM-V-2_6"
    #   o2.6: "openbmb/MiniCPM-o-2_6"
    #
    # modality supports
    #   2.0: image
    #   2.5: image
    #   2.6: image, video
    #   o2.6: image, video, audio
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        # Limit the modality in use; video is accepted here, so a fixed
        # "image" limit did not describe video requests.
        limit_mm_per_prompt={modality: 1},
    )

    # NOTE The stop_token_ids are different for various versions of MiniCPM-V
    # 2.0
    # stop_token_ids = [tokenizer.eos_id]
    #
    # 2.5
    # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
    #
    # 2.6 / o2.6
    stop_tokens = ['<|im_end|>', '<|endoftext|>']
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    modality_placeholder = {
        "image": "(<image>./</image>)",
        "video": "(<video>./</video>)",
    }

    # The chat template is applied once per question.
    prompts = [
        tokenizer.apply_chat_template(
            [{
                'role': 'user',
                'content': f"{modality_placeholder[modality]}\n{question}"
            }],
            tokenize=False,
            add_generation_prompt=True) for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
542
543


544
def run_minicpmo(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `openbmb/MiniCPM-o-2_6`."""
    return run_minicpmv_base(questions,
                             modality,
                             model_name="openbmb/MiniCPM-o-2_6")
546
547


548
def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `openbmb/MiniCPM-V-2_6`."""
    return run_minicpmv_base(questions,
                             modality,
                             model_name="openbmb/MiniCPM-V-2_6")
550
551


552
553
554
555
556
557
558
559
560
561
562
563
# Mistral-3 HF-format
def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for Mistral-Small-3.1-24B-Instruct (HF format).

    Only image input is accepted; the engine is configured with a tensor
    parallel size of 2.
    """
    assert modality == "image"

    formatted = [f"<s>[INST]{q}\n[IMG][/INST]" for q in questions]

    # NOTE: Need L40 (or equivalent) to avoid OOM
    mistral_args = EngineArgs(
        model="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(engine_args=mistral_args, prompts=formatted)


575
# LLama 3.2
576
def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `meta-llama/Llama-3.2-11B-Vision-Instruct`.

    Only image input is accepted. Prompts come from the tokenizer's chat
    template, applied to one image-plus-text user message per question.
    """
    assert modality == "image"

    repo_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

    def as_conversation(text):
        # A single user turn: an image item followed by the question text.
        return [{
            "role": "user",
            "content": [{
                "type": "image"
            }, {
                "type": "text",
                "text": text
            }]
        }]

    # Note: The default setting of max_num_seqs (256) and
    # max_model_len (131072) for this model may cause OOM.
    # You may lower either to run this example on lower-end GPUs.

    # The configuration below has been confirmed to launch on a single L40 GPU.
    mllama_args = EngineArgs(
        model=repo_id,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
    )

    tok = AutoTokenizer.from_pretrained(repo_id)
    rendered = tok.apply_chat_template(
        [as_conversation(q) for q in questions],
        add_generation_prompt=True,
        tokenize=False)

    return ModelRequestData(engine_args=mllama_args, prompts=rendered)
612
613


614
def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `meta-llama/Llama-4-Scout-17B-16E-Instruct`.

    Only image input is accepted. Prompts come from the tokenizer's chat
    template; no stop token ids are set.
    """
    assert modality == "image"

    repo_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

    llama4_args = EngineArgs(
        model=repo_id,
        max_model_len=8192,
        max_num_seqs=4,
        tensor_parallel_size=8,
        gpu_memory_utilization=0.4,
        limit_mm_per_prompt={"image": 1},
    )

    # One single-turn conversation (image item + question text) per question.
    conversations = []
    for question in questions:
        user_turn = {
            "role": "user",
            "content": [{
                "type": "image"
            }, {
                "type": "text",
                "text": f"{question}"
            }]
        }
        conversations.append([user_turn])

    tok = AutoTokenizer.from_pretrained(repo_id)
    rendered = tok.apply_chat_template(conversations,
                                       add_generation_prompt=True,
                                       tokenize=False)

    return ModelRequestData(
        engine_args=llama4_args,
        prompts=rendered,
        stop_token_ids=None,
    )


650
# Molmo
651
def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `allenai/Molmo-7B-D-0924` (image only)."""
    assert modality == "image"

    molmo_args = EngineArgs(
        model="allenai/Molmo-7B-D-0924",
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": 1},
    )

    # The backslash continuation makes the second line's leading spaces part
    # of the prompt text, so these two lines are kept exactly as they were.
    prompts = [
        f"<|im_start|>user <image>\n{question}<|im_end|> \
        <|im_start|>assistant\n" for question in questions
    ]

    return ModelRequestData(engine_args=molmo_args, prompts=prompts)
672
673


674
# NVLM-D
675
def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `nvidia/NVLM-D-72B` (image only).

    Prompts come from the model tokenizer's chat template.
    """
    assert modality == "image"

    repo_id = "nvidia/NVLM-D-72B"

    tok = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
    conversations = []
    for q in questions:
        conversations.append([{
            'role': 'user',
            'content': f"<image>\n{q}"
        }])
    rendered = tok.apply_chat_template(conversations,
                                       tokenize=False,
                                       add_generation_prompt=True)

    # Adjust this as necessary to fit in GPU
    nvlm_args = EngineArgs(
        model=repo_id,
        trust_remote_code=True,
        max_model_len=4096,
        tensor_parallel_size=4,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(engine_args=nvlm_args, prompts=rendered)
703
704


705
# PaliGemma
706
def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `google/paligemma-3b-mix-224` (image only).

    The question text is not used: one "caption en" prompt is emitted per
    question.
    """
    assert modality == "image"

    paligemma_args = EngineArgs(
        model="google/paligemma-3b-mix-224",
        limit_mm_per_prompt={"image": 1},
    )

    # PaliGemma has special prompt format for VQA
    caption_prompts = ["caption en" for _unused in questions]

    return ModelRequestData(engine_args=paligemma_args,
                            prompts=caption_prompts)
720
721


722
# PaliGemma 2
723
def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `google/paligemma2-3b-ft-docci-448`.

    Only image input is accepted. The question text is not used: one
    "caption en" prompt is emitted per question.
    """
    assert modality == "image"

    paligemma2_args = EngineArgs(
        model="google/paligemma2-3b-ft-docci-448",
        limit_mm_per_prompt={"image": 1},
    )

    # PaliGemma 2 has special prompt format for VQA
    caption_prompts = ["caption en" for _unused in questions]

    return ModelRequestData(engine_args=paligemma2_args,
                            prompts=caption_prompts)
737
738


739
# Phi-3-Vision
740
def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `microsoft/Phi-3.5-vision-instruct`.

    Only image input is accepted; `num_crops` is overridden to 16 through
    `mm_processor_kwargs`.
    """
    assert modality == "image"

    # num_crops is an override kwarg to the multimodal image processor;
    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
    # to use 16 for single frame scenarios, and 4 for multi-frame.
    #
    # Generally speaking, a larger value for num_crops results in more
    # tokens per image instance, because it may scale the image more in
    # the image preprocessing. Some references in the model docs and the
    # formula for image tokens after the preprocessing
    # transform can be found below.
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    phi3v_args = EngineArgs(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"num_crops": 16},
        limit_mm_per_prompt={"image": 1},
    )

    formatted = [
        f"<|user|>\n<|image_1|>\n{q}<|end|>\n<|assistant|>\n"
        for q in questions
    ]

    return ModelRequestData(engine_args=phi3v_args, prompts=formatted)
774
775


776
# Phi-4-multimodal-instruct
777
def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `microsoft/Phi-4-multimodal-instruct`.

    Phi-4-multimodal-instruct supports both image and audio inputs; this
    helper handles image inputs only. The model snapshot is downloaded
    first, and a LoRA request pointing at its `vision-lora` sub-directory
    is returned with the prompts.
    """
    assert modality == "image"

    repo_dir = snapshot_download("microsoft/Phi-4-multimodal-instruct")
    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    vision_adapter_dir = os.path.join(repo_dir, "vision-lora")

    phi4_args = EngineArgs(
        model=repo_dir,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        enable_lora=True,
        max_lora_rank=320,
        limit_mm_per_prompt={"image": 1},
    )

    formatted = [
        f"<|user|><|image_1|>{q}<|end|><|assistant|>" for q in questions
    ]
    vision_lora = LoRARequest("vision", 1, vision_adapter_dir)

    return ModelRequestData(
        engine_args=phi4_args,
        prompts=formatted,
        lora_requests=[vision_lora],
    )
806
807


808
# Pixtral HF-format
809
def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `mistral-community/pixtral-12b`.

    Only image input is accepted.
    """
    assert modality == "image"

    formatted = [f"<s>[INST]{q}\n[IMG][/INST]" for q in questions]

    # NOTE: Need L40 (or equivalent) to avoid OOM
    pixtral_args = EngineArgs(
        model="mistral-community/pixtral-12b",
        max_model_len=6144,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(engine_args=pixtral_args, prompts=formatted)
828
829


830
# Qwen
831
def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `Qwen/Qwen-VL` (image only).

    The architecture is overridden to `QwenVLForConditionalGeneration`
    through `hf_overrides`.
    """
    assert modality == "image"

    formatted = [f"{q}Picture 1: <img></img>\n" for q in questions]

    qwen_args = EngineArgs(
        model="Qwen/Qwen-VL",
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(engine_args=qwen_args, prompts=formatted)
849
850


851
# Qwen2-VL
852
def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `Qwen/Qwen2-VL-7B-Instruct`.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Either "image" or "video"; selects the vision placeholder
            token used in the prompt and the modality that is limited.

    Returns:
        The engine arguments and the formatted prompts.

    Raises:
        ValueError: If `modality` is neither "image" nor "video".
    """
    placeholders = {
        "image": "<|image_pad|>",
        "video": "<|video_pad|>",
    }
    if modality not in placeholders:
        # Without this check an unsupported modality left `placeholder`
        # unbound and failed with an UnboundLocalError in the prompt below.
        raise ValueError(f"Modality {modality} is not supported.")
    placeholder = placeholders[modality]

    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
        },
        # Limit the modality in use instead of always limiting "image".
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
         f"{question}<|im_end|>\n"
         "<|im_start|>assistant\n") for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
884
885


Roger Wang's avatar
Roger Wang committed
886
# Qwen2.5-VL
887
def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `Qwen/Qwen2.5-VL-3B-Instruct`.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Either "image" or "video"; selects the vision placeholder
            token used in the prompt and the modality that is limited.

    Returns:
        The engine arguments and the formatted prompts.

    Raises:
        ValueError: If `modality` is neither "image" nor "video".
    """
    placeholders = {
        "image": "<|image_pad|>",
        "video": "<|video_pad|>",
    }
    if modality not in placeholders:
        # Without this check an unsupported modality left `placeholder`
        # unbound and failed with an UnboundLocalError in the prompt below.
        raise ValueError(f"Modality {modality} is not supported.")
    placeholder = placeholders[modality]

    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
        # Limit the modality in use instead of always limiting "image".
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
         f"{question}<|im_end|>\n"
         "<|im_start|>assistant\n") for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
Roger Wang's avatar
Roger Wang committed
919
920


921
922
923
924
925
926
927
928
929
930
# SkyworkR1V
def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for `Skywork/Skywork-R1V-38B` (image only).

    Prompts come from the model tokenizer's chat template; stop token ids
    are looked up from a fixed list of stop-token strings.
    """
    assert modality == "image"

    repo_id = "Skywork/Skywork-R1V-38B"
    tok = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)

    conversations = []
    for q in questions:
        conversations.append([{
            'role': 'user',
            'content': f"<image>\n{q}"
        }])
    rendered = tok.apply_chat_template(conversations,
                                       tokenize=False,
                                       add_generation_prompt=True)

    # Stop tokens for SkyworkR1V
    # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py
    stop_ids = [
        tok.convert_tokens_to_ids(token)
        for token in ("<｜end▁of▁sentence｜>", "<|endoftext|>")
    ]

    skywork_args = EngineArgs(
        model=repo_id,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=skywork_args,
        prompts=rendered,
        stop_token_ids=stop_ids,
    )


956
# Maps each model-type key to the helper that builds its request data.
# Every helper takes (questions, modality) and returns a ModelRequestData.
model_example_map = {
    "aria": run_aria,
    "aya_vision": run_aya_vision,
    "blip-2": run_blip2,
    "chameleon": run_chameleon,
    "deepseek_vl_v2": run_deepseek_vl2,
    "florence2": run_florence2,
    "fuyu": run_fuyu,
    "gemma3": run_gemma3,
    "glm4v": run_glm4v,
    "h2ovl_chat": run_h2ovl,
    "idefics3": run_idefics3,
    "internvl_chat": run_internvl,
    "llava": run_llava,
    "llava-next": run_llava_next,
    "llava-next-video": run_llava_next_video,
    "llava-onevision": run_llava_onevision,
    "mantis": run_mantis,
    "minicpmo": run_minicpmo,
    "minicpmv": run_minicpmv,
    "mistral3": run_mistral3,
    "mllama": run_mllama,
    "llama4": run_llama4,
    "molmo": run_molmo,
    "NVLM_D": run_nvlm_d,
    "paligemma": run_paligemma,
    "paligemma2": run_paligemma2,
    "phi3_v": run_phi3v,
    "phi4_mm": run_phi4mm,
    "pixtral_hf": run_pixtral_hf,
    "qwen_vl": run_qwen_vl,
    "qwen2_vl": run_qwen2_vl,
    "qwen2_5_vl": run_qwen2_5_vl,
    "skywork_chat": run_skyworkr1v,
    "smolvlm": run_smolvlm,
}


994
995
996
997
998
999
1000
1001
1002
1003
1004
def get_multi_modal_input(args):
    """Build the sample multi-modal input for the requested modality.

    Args:
        args: Parsed CLI arguments. Reads ``args.modality`` and, for video,
            ``args.num_frames``.

    Returns:
        A dict of the form::

            {
                "data": image or video,
                "questions": list of question strings,
            }

    Raises:
        ValueError: If ``args.modality`` is neither "image" nor "video".
    """
    if args.modality == "image":
        # Input image and questions
        image = ImageAsset("cherry_blossom") \
            .pil_image.convert("RGB")
        img_questions = [
            "What is the content of this image?",
            "Describe the content of this image in detail.",
            "What's in the image?",
            "Where is this image taken?",
        ]

        return {
            "data": image,
            "questions": img_questions,
        }

    if args.modality == "video":
        # Input video and questions
        video = VideoAsset(name="sample_demo_1.mp4",
                           num_frames=args.num_frames).np_ndarrays
        vid_questions = ["Why is this video funny?"]

        return {
            "data": video,
            "questions": vid_questions,
        }

    msg = f"Modality {args.modality} is not supported."
    raise ValueError(msg)


1032
1033
def apply_image_repeat(image_repeat_prob, num_prompts, data,
                       prompts: list[str], modality):
    """Build batch inputs, repeating the image with a given probability.

    Used to simulate hit/miss for the MM preprocessor cache: for each prompt,
    the previous image is reused with probability ``image_repeat_prob``;
    otherwise a copy with one modified pixel is used instead.

    Args:
        image_repeat_prob: Probability in [0, 1] of reusing the current image.
        num_prompts: Number of input dicts to generate.
        data: Starting image. It must provide ``copy()`` and
            ``putpixel((x, y), value)``; it is never mutated in place.
        prompts: Prompt strings, cycled through if shorter than
            ``num_prompts``.
        modality: Key under which the image is stored in
            ``"multi_modal_data"``.

    Returns:
        A list of ``num_prompts`` dicts, each with a ``"prompt"`` and a
        ``"multi_modal_data"`` entry.
    """
    assert (image_repeat_prob <= 1.0 and image_repeat_prob >= 0)
    no_yes = [0, 1]
    probs = [1.0 - image_repeat_prob, image_repeat_prob]

    inputs = []
    cur_image = data
    for i in range(num_prompts):
        repeat = random.choices(no_yes, probs)[0]
        if not repeat:
            # No repeat => Modify one pixel on a fresh copy so the new image
            # differs from every earlier one.
            cur_image = cur_image.copy()
            # Encode the prompt index as a 24-bit RGB value; masking keeps
            # every channel within 0-255 even when i >= 65536.
            new_val = ((i >> 16) & 0xFF, (i >> 8) & 0xFF, i & 0xFF)
            cur_image.putpixel((0, 0), new_val)

        inputs.append({
            "prompt": prompts[i % len(prompts)],
            "multi_modal_data": {
                modality: cur_image
            }
        })

    return inputs


1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
@contextmanager
def time_counter(enable: bool):
    """Context manager that optionally reports the time spent in its body.

    Args:
        enable: If True, print the elapsed time in seconds (framed by dashed
            lines) once the body finishes normally. If False, do nothing.
    """
    if not enable:
        yield
        return

    import time

    # perf_counter is monotonic, so the measured interval is unaffected by
    # system clock adjustments (unlike time.time()).
    start_time = time.perf_counter()
    yield
    elapsed_time = time.perf_counter() - start_time
    print("-" * 50)
    print("-- generate time = {}".format(elapsed_time))
    print("-" * 50)


1076
1077
1078
1079
1080
def main(args):
    """Run offline inference for the selected vision-language model.

    Builds the multi-modal inputs for ``args.modality``, constructs the
    engine from the model-specific ``ModelRequestData``, generates one
    completion per prompt and prints each generated text.

    Args:
        args: Parsed CLI arguments (see the parser in the ``__main__`` block).

    Raises:
        ValueError: If ``args.model_type`` or ``args.modality`` is not
            supported.
    """
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    # Validate before loading the model so bad input fails fast.
    assert args.num_prompts > 0

    modality = args.modality
    mm_input = get_multi_modal_input(args)
    data = mm_input["data"]
    questions = mm_input["questions"]

    req_data = model_example_map[model](questions, modality)

    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {})

    engine_args = asdict(req_data.engine_args) | {
        "seed": args.seed,
        "disable_mm_preprocessor_cache": args.disable_mm_preprocessor_cache,
    }
    llm = LLM(**engine_args)

    # To maintain code compatibility in this script, we add LoRA here.
    # You can also add LoRA using:
    # llm.generate(prompts, lora_request=lora_request,...)
    if req_data.lora_requests:
        for lora_request in req_data.lora_requests:
            llm.llm_engine.add_lora(lora_request=lora_request)

    # Don't want to check the flag multiple times, so just hijack `prompts`.
    prompts = req_data.prompts if args.use_different_prompt_per_request else [
        req_data.prompts[0]
    ]

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=req_data.stop_token_ids)

    if args.num_prompts == 1:
        # Single inference
        inputs = {
            "prompt": prompts[0],
            "multi_modal_data": {
                modality: data
            },
        }
    else:
        # Batch inference
        if args.image_repeat_prob is not None:
            # Repeat images with specified probability of "image_repeat_prob"
            inputs = apply_image_repeat(args.image_repeat_prob,
                                        args.num_prompts, data, prompts,
                                        modality)
        else:
            # Use the same image for all prompts
            inputs = [{
                "prompt": prompts[i % len(prompts)],
                "multi_modal_data": {
                    modality: data
                },
            } for i in range(args.num_prompts)]

    # Add LoRA request if applicable
    lora_request = (req_data.lora_requests *
                    args.num_prompts if req_data.lora_requests else None)

    with time_counter(args.time_generate):
        outputs = llm.generate(
            inputs,
            sampling_params=sampling_params,
            lora_request=lora_request,
        )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        print("-" * 50)
1158
1159
1160
1161
1162


# Script entry point: define the CLI, parse the arguments and run the demo.
if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'vision language models for text generation')
    parser.add_argument('--model-type',
                        '-m',
                        type=str,
                        default="llava",
                        choices=model_example_map.keys(),
                        help='Huggingface "model_type".')
    parser.add_argument('--num-prompts',
                        type=int,
                        default=4,
                        help='Number of prompts to run.')
    parser.add_argument('--modality',
                        type=str,
                        default="image",
                        choices=['image', 'video'],
                        help='Modality of the input.')
    parser.add_argument('--num-frames',
                        type=int,
                        default=16,
                        help='Number of frames to extract from the video.')
    parser.add_argument("--seed",
                        type=int,
                        default=None,
                        help="Set the seed when initializing `vllm.LLM`.")

    parser.add_argument(
        '--image-repeat-prob',
        type=float,
        default=None,
        help='Simulates the hit-ratio for multi-modal preprocessor cache'
        ' (if enabled)')

    parser.add_argument(
        '--disable-mm-preprocessor-cache',
        action='store_true',
        help='If True, disables caching of multi-modal preprocessor/mapper.')

    parser.add_argument(
        '--time-generate',
        action='store_true',
        help='If True, then print the total generate() call time')

    parser.add_argument(
        '--use-different-prompt-per-request',
        action='store_true',
        help='If True, then use different prompt (with the same multi-modal '
        'data) for each request.')

    args = parser.parse_args()
    main(args)