# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for text generation.

For most models, the prompt format should follow the corresponding examples
in the model's HuggingFace repository.
"""
9
import os
10
import random
11
12
from dataclasses import asdict
from typing import NamedTuple, Optional
13

14
from huggingface_hub import snapshot_download
15
16
from transformers import AutoTokenizer

17
from vllm import LLM, EngineArgs, SamplingParams
18
from vllm.assets.image import ImageAsset
19
from vllm.assets.video import VideoAsset
20
from vllm.lora.request import LoRARequest
21
22
from vllm.utils import FlexibleArgumentParser

23
24
25
26
27
28
29
30

class ModelRequestData(NamedTuple):
    """Bundle returned by every ``run_*`` builder: engine setup plus prompts."""
    # Arguments used to construct the vLLM engine for the selected model.
    engine_args: EngineArgs
    # Fully formatted prompt strings produced by the model's builder.
    prompts: list[str]
    # Optional token IDs handed to the sampling parameters as stop tokens.
    stop_token_ids: Optional[list[int]] = None
    # Optional LoRA adapters to register on the engine before generating.
    lora_requests: Optional[list[LoRARequest]] = None


31
32
33
34
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.

35

36
# Aria
37
def run_aria(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``rhymes-ai/Aria`` example.

    Args:
        questions: Questions about the image; one prompt is built for each.
        modality: Must be ``"image"``.

    Returns:
        Engine arguments, prompts and stop token IDs for this model.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    assert modality == "image"
    model_name = "rhymes-ai/Aria"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        dtype="bfloat16",
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
                "<|im_end|>\n<|im_start|>assistant\n")
               for question in questions]

    stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
61
62
63


# BLIP-2
64
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``Salesforce/blip2-opt-2.7b`` example.

    Args:
        questions: Questions about the image; one prompt is built for each.
        modality: Must be ``"image"``.

    Returns:
        Engine arguments and prompts for this model.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    assert modality == "image"

    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
    prompts = [f"Question: {question} Answer:" for question in questions]
    engine_args = EngineArgs(
        model="Salesforce/blip2-opt-2.7b",
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
79
80
81


# Chameleon
82
def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``facebook/chameleon-7b`` example.

    Args:
        questions: Questions about the image; one prompt is built for each.
        modality: Must be ``"image"``.

    Returns:
        Engine arguments and prompts for this model.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    assert modality == "image"

    # The image placeholder follows the question text.
    prompts = [f"{question}<image>" for question in questions]
    engine_args = EngineArgs(
        model="facebook/chameleon-7b",
        max_model_len=4096,
        max_num_seqs=2,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
97
98


99
# Deepseek-VL2
100
def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``deepseek-ai/deepseek-vl2-tiny`` example.

    Args:
        questions: Questions about the image; one prompt is built for each.
        modality: Must be ``"image"``.

    Returns:
        Engine arguments and prompts for this model.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    assert modality == "image"

    model_name = "deepseek-ai/deepseek-vl2-tiny"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
        # Override the architecture name reported by the HF config.
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
    )

    prompts = [
        f"<|User|>: <image>\n{question}\n\n<|Assistant|>:"
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
122
123


124
# Florence2
125
def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``microsoft/Florence-2-large`` example.

    Args:
        questions: Only the number of questions is used; each one yields the
            same fixed task prompt.
        modality: Must be ``"image"``.

    Returns:
        Engine arguments and prompts for this model.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    assert modality == "image"

    engine_args = EngineArgs(
        model="microsoft/Florence-2-large",
        tokenizer="facebook/bart-large",
        max_num_seqs=8,
        trust_remote_code=True,
        dtype="bfloat16",
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    # The question text is not inserted; the prompt is a fixed task token.
    prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
143
144


145
# Fuyu
146
def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``adept/fuyu-8b`` example.

    Args:
        questions: Questions about the image; one prompt is built for each.
        modality: Must be ``"image"``.

    Returns:
        Engine arguments and prompts for this model.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    assert modality == "image"

    # Each prompt is the bare question terminated by a newline.
    prompts = [f"{question}\n" for question in questions]
    engine_args = EngineArgs(
        model="adept/fuyu-8b",
        max_model_len=2048,
        max_num_seqs=2,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
161
162


163
# Gemma 3
164
def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``google/gemma-3-4b-it`` example.

    Args:
        questions: Questions about the image; one prompt is built for each.
        modality: Must be ``"image"``.

    Returns:
        Engine arguments and prompts for this model.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    assert modality == "image"
    model_name = "google/gemma-3-4b-it"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        # Default is False; setting it to True is not supported in V1 yet
        mm_processor_kwargs={"do_pan_and_scan": True},
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    prompts = [("<bos><start_of_turn>user\n"
                f"<start_of_image>{question}<end_of_turn>\n"
                "<start_of_turn>model\n") for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
185
186


187
# GLM-4v
188
def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``THUDM/glm-4v-9b`` example.

    Args:
        questions: Questions about the image; one prompt is built for each.
        modality: Must be ``"image"``.

    Returns:
        Engine arguments, prompts and stop token IDs for this model.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    assert modality == "image"
    model_name = "THUDM/glm-4v-9b"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        trust_remote_code=True,
        enforce_eager=True,
        hf_overrides={"architectures": ["GLM4VForCausalLM"]},
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    # Adjacent literals are joined by the parser. A backslash continuation
    # *inside* the string would copy the next line's source indentation into
    # the prompt, putting stray spaces between the image tokens and question.
    prompts = [
        ("<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>"
         f"{question}<|assistant|>") for question in questions
    ]

    stop_token_ids = [151329, 151336, 151338]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
214
215
216


# H2OVL-Mississippi
217
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``h2oai/h2ovl-mississippi-800m`` example.

    Prompts are produced by the model tokenizer's chat template.

    Args:
        questions: Questions about the image; one conversation is built for
            each.
        modality: Must be ``"image"``.

    Returns:
        Engine arguments, prompts and stop token IDs for this model.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    assert modality == "image"

    model_name = "h2oai/h2ovl-mississippi-800m"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    # One single-turn conversation per question, image placeholder first.
    messages = [[{
        'role': 'user',
        'content': f"<image>\n{question}"
    }] for question in questions]
    prompts = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
                                            add_generation_prompt=True)

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    stop_token_ids = [tokenizer.eos_token_id]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
248
249
250


# Idefics3-8B-Llama3
251
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``HuggingFaceM4/Idefics3-8B-Llama3`` example.

    Args:
        questions: Questions about the image; one prompt is built for each.
        modality: Must be ``"image"``.

    Returns:
        Engine arguments and prompts for this model.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    assert modality == "image"
    model_name = "HuggingFaceM4/Idefics3-8B-Llama3"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        # if you are running out of memory, you can reduce the "longest_edge".
        # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
        mm_processor_kwargs={
            "size": {
                "longest_edge": 3 * 364
            },
        },
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    prompts = [(
        f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
    ) for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
277
278
279


# InternVL
280
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``OpenGVLab/InternVL2-2B`` example.

    Prompts are produced by the model tokenizer's chat template.

    Args:
        questions: Questions about the image; one conversation is built for
            each.
        modality: Must be ``"image"``.

    Returns:
        Engine arguments, prompts and stop token IDs for this model.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    assert modality == "image"

    model_name = "OpenGVLab/InternVL2-2B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    # One single-turn conversation per question, image placeholder first.
    messages = [[{
        'role': 'user',
        'content': f"<image>\n{question}"
    }] for question in questions]
    prompts = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
                                            add_generation_prompt=True)

    # Stop tokens for InternVL
    # models variants may have different stop tokens
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    candidate_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
    # NOTE(review): depending on the tokenizer, a token missing from the
    # vocabulary may come back as None; drop those so only real IDs are
    # used as stop tokens.
    stop_token_ids = [
        token_id for token_id in candidate_ids if token_id is not None
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
314
315


316
# LLaVA-1.5
317
def run_llava(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``llava-hf/llava-1.5-7b-hf`` example.

    Args:
        questions: Questions about the image; one prompt is built for each.
        modality: Must be ``"image"``.

    Returns:
        Engine arguments and prompts for this model.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    assert modality == "image"

    prompts = [
        f"USER: <image>\n{question}\nASSISTANT:" for question in questions
    ]

    engine_args = EngineArgs(
        model="llava-hf/llava-1.5-7b-hf",
        max_model_len=4096,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
334
335
336


# LLaVA-1.6/LLaVA-NeXT
337
def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``llava-hf/llava-v1.6-mistral-7b-hf`` example.

    Args:
        questions: Questions about the image; one prompt is built for each.
        modality: Must be ``"image"``.

    Returns:
        Engine arguments and prompts for this model.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    assert modality == "image"

    prompts = [f"[INST] <image>\n{question} [/INST]" for question in questions]
    engine_args = EngineArgs(
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        max_model_len=8192,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
351
352
353
354


# LlaVA-NeXT-Video
# Currently only support for video input
355
356
def run_llava_next_video(questions: list[str],
                         modality: str) -> ModelRequestData:
    """Build the request data for the ``llava-hf/LLaVA-NeXT-Video-7B-hf`` example.

    Args:
        questions: Questions about the video; one prompt is built for each.
        modality: Must be ``"video"``.

    Returns:
        Engine arguments and prompts for this model.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    assert modality == "video"

    prompts = [
        f"USER: <video>\n{question} ASSISTANT:" for question in questions
    ]
    engine_args = EngineArgs(
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        max_model_len=8192,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
372
373


374
# LLaVA-OneVision
375
376
def run_llava_onevision(questions: list[str],
                        modality: str) -> ModelRequestData:
    """Build the request data for the ``llava-hf/llava-onevision-qwen2-7b-ov-hf`` example.

    Args:
        questions: Questions about the input; one prompt is built for each.
        modality: Either ``"image"`` or ``"video"``; selects the placeholder
            tag (``<image>`` / ``<video>``) inserted into the prompt.

    Returns:
        Engine arguments and prompts for this model.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    # Reject unknown modalities up front instead of failing later on an
    # unbound ``prompts`` variable.
    if modality not in ("image", "video"):
        raise ValueError(f"Modality {modality} is not supported.")

    # The image and video prompts differ only in the placeholder tag, whose
    # name equals the modality. Adjacent literals are used (rather than a
    # backslash continuation inside the string) so that source indentation
    # is never copied into the prompt.
    prompts = [
        (f"<|im_start|>user <{modality}>\n{question}<|im_end|> "
         "<|im_start|>assistant\n") for question in questions
    ]

    engine_args = EngineArgs(
        model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
        max_model_len=16384,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
400
401


402
# Mantis
403
def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``TIGER-Lab/Mantis-8B-siglip-llama3`` example.

    Args:
        questions: Questions about the image; one prompt is built for each.
        modality: Must be ``"image"``.

    Returns:
        Engine arguments, prompts and stop token IDs for this model.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    assert modality == "image"

    llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'  # noqa: E501
    # The image placeholder goes on its own line after the question.
    prompts = [
        llama3_template.format(f"{question}\n<image>")
        for question in questions
    ]

    engine_args = EngineArgs(
        model="TIGER-Lab/Mantis-8B-siglip-llama3",
        max_model_len=4096,
        hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    stop_token_ids = [128009]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
425
426
427


# MiniCPM-V
428
def run_minicpmv_base(questions: list[str], modality: str,
                      model_name: str) -> ModelRequestData:
    """Build the request data shared by the MiniCPM-V / MiniCPM-o examples.

    Prompts are produced by the model tokenizer's chat template.

    Args:
        questions: Questions about the input; one prompt is built for each.
        modality: Either ``"image"`` or ``"video"``.
        model_name: Model repository name used for both the tokenizer and
            the engine.

    Returns:
        Engine arguments, prompts and stop token IDs for this model.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    assert modality in ["image", "video"]
    # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa

    # 2.0
    # The official repo doesn't work yet, so we need to use a fork for now
    # For more details, please see: See: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa
    # model_name = "HwwwH/MiniCPM-V-2"

    # 2.5
    # model_name = "openbmb/MiniCPM-Llama3-V-2_5"

    # 2.6
    # model_name = "openbmb/MiniCPM-V-2_6"
    # o2.6

    # modality supports
    # 2.0: image
    # 2.5: image
    # 2.6: image, video
    # o2.6: image, video, audio
    # model_name = "openbmb/MiniCPM-o-2_6"
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    # NOTE The stop_token_ids are different for various versions of MiniCPM-V
    # 2.0
    # stop_token_ids = [tokenizer.eos_id]

    # 2.5
    # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]

    # 2.6 / o2.6
    stop_tokens = ['<|im_end|>', '<|endoftext|>']
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    # Placeholder text inserted ahead of the question for each modality.
    modality_placeholder = {
        "image": "(<image>./</image>)",
        "video": "(<video>./</video>)",
    }

    prompts = [
        tokenizer.apply_chat_template(
            [{
                'role': 'user',
                'content': f"{modality_placeholder[modality]}\n{question}"
            }],
            tokenize=False,
            add_generation_prompt=True) for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
490
491


492
def run_minicpmo(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for ``openbmb/MiniCPM-o-2_6``.

    Delegates to ``run_minicpmv_base`` with the model name fixed.
    """
    return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-o-2_6")
494
495


496
def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for ``openbmb/MiniCPM-V-2_6``.

    Delegates to ``run_minicpmv_base`` with the model name fixed.
    """
    return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-V-2_6")
498
499


500
# LLama 3.2
501
def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``meta-llama/Llama-3.2-11B-Vision-Instruct`` example.

    Prompts are produced by the model tokenizer's chat template.

    Args:
        questions: Questions about the image; one conversation is built for
            each.
        modality: Must be ``"image"``.

    Returns:
        Engine arguments and prompts for this model.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    assert modality == "image"

    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"

    # Note: The default setting of max_num_seqs (256) and
    # max_model_len (131072) for this model may cause OOM.
    # You may lower either to run this example on lower-end GPUs.

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=16,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Each conversation is one user turn: an image part, then the text.
    messages = [[{
        "role":
        "user",
        "content": [{
            "type": "image"
        }, {
            "type": "text",
            "text": question
        }]
    }] for question in questions]
    prompts = tokenizer.apply_chat_template(messages,
                                            add_generation_prompt=True,
                                            tokenize=False)

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
537
538


539
# Molmo
540
def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``allenai/Molmo-7B-D-0924`` example.

    Args:
        questions: Questions about the image; one prompt is built for each.
        modality: Must be ``"image"``.

    Returns:
        Engine arguments and prompts for this model.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    assert modality == "image"

    model_name = "allenai/Molmo-7B-D-0924"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        dtype="bfloat16",
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    # Adjacent literals are joined by the parser. A backslash continuation
    # *inside* the string would copy the next line's source indentation into
    # the prompt as a run of stray spaces.
    prompts = [
        (f"<|im_start|>user <image>\n{question}<|im_end|> "
         "<|im_start|>assistant\n") for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
561
562


563
# NVLM-D
564
def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``nvidia/NVLM-D-72B`` example.

    Prompts are produced by the model tokenizer's chat template.

    Args:
        questions: Questions about the image; one conversation is built for
            each.
        modality: Must be ``"image"``.

    Returns:
        Engine arguments and prompts for this model.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    assert modality == "image"

    model_name = "nvidia/NVLM-D-72B"

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        tensor_parallel_size=4,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    # One single-turn conversation per question, image placeholder first.
    messages = [[{
        'role': 'user',
        'content': f"<image>\n{question}"
    }] for question in questions]
    prompts = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
                                            add_generation_prompt=True)

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
592
593


594
# PaliGemma
595
def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``google/paligemma-3b-mix-224`` example.

    Args:
        questions: Only the number of questions is used; each one yields the
            same fixed prompt.
        modality: Must be ``"image"``.

    Returns:
        Engine arguments and prompts for this model.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    assert modality == "image"

    # PaliGemma has special prompt format for VQA
    prompts = ["caption en" for _ in questions]
    engine_args = EngineArgs(
        model="google/paligemma-3b-mix-224",
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
608
609


610
# PaliGemma 2
611
def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``google/paligemma2-3b-ft-docci-448`` example.

    Args:
        questions: Only the number of questions is used; each one yields the
            same fixed prompt.
        modality: Must be ``"image"``.

    Returns:
        Engine arguments and prompts for this model.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    assert modality == "image"

    # PaliGemma 2 has special prompt format for VQA
    prompts = ["caption en" for _ in questions]
    engine_args = EngineArgs(
        model="google/paligemma2-3b-ft-docci-448",
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
624
625


626
# Phi-3-Vision
627
def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``microsoft/Phi-3.5-vision-instruct`` example.

    Args:
        questions: Questions about the image; one prompt is built for each.
        modality: Must be ``"image"``.

    Returns:
        Engine arguments and prompts for this model.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    assert modality == "image"

    prompts = [
        f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n"
        for question in questions
    ]

    # num_crops is an override kwarg to the multimodal image processor;
    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
    # to use 16 for single frame scenarios, and 4 for multi-frame.
    #
    # Generally speaking, a larger value for num_crops results in more
    # tokens per image instance, because it may scale the image more in
    # the image preprocessing. Some references in the model docs and the
    # formula for image tokens after the preprocessing
    # transform can be found below.
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    engine_args = EngineArgs(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"num_crops": 16},
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
661
662


663
# Phi-4-multimodal-instruct
664
def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``microsoft/Phi-4-multimodal-instruct`` example.

    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
    show how to process image inputs.

    Args:
        questions: Questions about the image; one prompt is built for each.
        modality: Must be ``"image"``.

    Returns:
        Engine arguments, prompts and a single LoRA request pointing at the
        ``vision-lora`` directory inside the downloaded snapshot.
    """
    assert modality == "image"
    model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    vision_lora_path = os.path.join(model_path, "vision-lora")
    prompts = [
        f"<|user|><|image_1|>{question}<|end|><|assistant|>"
        for question in questions
    ]
    engine_args = EngineArgs(
        model=model_path,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        enable_lora=True,
        max_lora_rank=320,
        lora_extra_vocab_size=0,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
    )
693
694


695
# Pixtral HF-format
696
def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``mistral-community/pixtral-12b`` example.

    Args:
        questions: Questions about the image; one prompt is built for each.
        modality: Must be ``"image"``.

    Returns:
        Engine arguments and prompts for this model.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    assert modality == "image"

    model_name = "mistral-community/pixtral-12b"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
715
716


717
# Qwen
718
def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``Qwen/Qwen-VL`` example.

    Args:
        questions: Questions about the image; one prompt is built for each.
        modality: Must be ``"image"``.

    Returns:
        Engine arguments and prompts for this model.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    assert modality == "image"

    engine_args = EngineArgs(
        model="Qwen/Qwen-VL",
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
736
737


738
# Qwen2-VL
739
def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``Qwen/Qwen2-VL-7B-Instruct`` example.

    Args:
        questions: Questions about the input; one prompt is built for each.
        modality: Either ``"image"`` or ``"video"``; selects the pad token
            placed between the vision start/end markers.

    Returns:
        Engine arguments and prompts for this model.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    # Resolve the placeholder first so an unsupported modality fails with a
    # clear error instead of an unbound local further down.
    if modality == "image":
        placeholder = "<|image_pad|>"
    elif modality == "video":
        placeholder = "<|video_pad|>"
    else:
        raise ValueError(f"Modality {modality} is not supported.")

    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
        },
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    prompts = [
        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
         f"{question}<|im_end|>\n"
         "<|im_start|>assistant\n") for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
771
772


# Qwen2.5-VL
774
def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for the ``Qwen/Qwen2.5-VL-3B-Instruct`` example.

    Args:
        questions: Questions about the input; one prompt is built for each.
        modality: Either ``"image"`` or ``"video"``; selects the pad token
            placed between the vision start/end markers.

    Returns:
        Engine arguments and prompts for this model.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.

    Note:
        Reads ``args.disable_mm_preprocessor_cache`` from a module-level
        ``args`` object that is defined outside this function.
    """
    # Resolve the placeholder first so an unsupported modality fails with a
    # clear error instead of an unbound local further down.
    if modality == "image":
        placeholder = "<|image_pad|>"
    elif modality == "video":
        placeholder = "<|video_pad|>"
    else:
        raise ValueError(f"Modality {modality} is not supported.")

    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    prompts = [
        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
         f"{question}<|im_end|>\n"
         "<|im_start|>assistant\n") for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
# Model registry


808
# Maps each supported model type name to the function that builds its
# request data (engine arguments, prompts and optional extras).
model_example_map = {
    "aria": run_aria,
    "blip-2": run_blip2,
    "chameleon": run_chameleon,
    "deepseek_vl_v2": run_deepseek_vl2,
    "florence2": run_florence2,
    "fuyu": run_fuyu,
    "gemma3": run_gemma3,
    "glm4v": run_glm4v,
    "h2ovl_chat": run_h2ovl,
    "idefics3": run_idefics3,
    "internvl_chat": run_internvl,
    "llava": run_llava,
    "llava-next": run_llava_next,
    "llava-next-video": run_llava_next_video,
    "llava-onevision": run_llava_onevision,
    "mantis": run_mantis,
    "minicpmo": run_minicpmo,
    "minicpmv": run_minicpmv,
    "mllama": run_mllama,
    "molmo": run_molmo,
    "NVLM_D": run_nvlm_d,
    "paligemma": run_paligemma,
    "paligemma2": run_paligemma2,
    "phi3_v": run_phi3v,
    "phi4_mm": run_phi4mm,
    "pixtral_hf": run_pixtral_hf,
    "qwen_vl": run_qwen_vl,
    "qwen2_vl": run_qwen2_vl,
    "qwen2_5_vl": run_qwen2_5_vl,
}


841
842
843
844
845
846
847
848
849
850
851
def get_multi_modal_input(args):
    """Load the sample asset and its questions for the requested modality.

    Args:
        args: Object exposing ``modality``; ``num_frames`` is additionally
            read when the modality is ``"video"``.

    Returns:
        A dict of the form::

            {
                "data": image or video,
                "questions": list of question strings,
            }

    Raises:
        ValueError: If ``args.modality`` is neither ``"image"`` nor
            ``"video"``.
    """
    if args.modality == "image":
        # Input image and question
        image = ImageAsset("cherry_blossom") \
            .pil_image.convert("RGB")
        img_questions = [
            "What is the content of this image?",
            "Describe the content of this image in detail.",
            "What's in the image?",
            "Where is this image taken?",
        ]

        return {
            "data": image,
            "questions": img_questions,
        }

    if args.modality == "video":
        # Input video and question
        video = VideoAsset(name="sample_demo_1.mp4",
                           num_frames=args.num_frames).np_ndarrays
        vid_questions = ["Why is this video funny?"]

        return {
            "data": video,
            "questions": vid_questions,
        }

    msg = f"Modality {args.modality} is not supported."
    raise ValueError(msg)


879
880
def apply_image_repeat(image_repeat_prob, num_prompts, data,
                       prompts: list[str], modality):
    """Repeats images with provided probability of "image_repeat_prob".

    Used to simulate hit/miss for the MM preprocessor cache.

    Args:
        image_repeat_prob: Probability in ``[0, 1]`` that a request reuses
            the previous request's image unchanged.
        num_prompts: Number of request dicts to build.
        data: Starting image; must provide ``copy()`` and ``putpixel()``.
            It is never modified in place.
        prompts: Prompt strings, cycled through when ``num_prompts`` exceeds
            their number.
        modality: Key under which the image is stored in
            ``multi_modal_data``.

    Returns:
        A list of ``num_prompts`` dicts, each with a ``"prompt"`` and a
        ``"multi_modal_data"`` entry.
    """
    assert 0 <= image_repeat_prob <= 1.0
    no_yes = [0, 1]
    probs = [1.0 - image_repeat_prob, image_repeat_prob]

    inputs = []
    cur_image = data
    for i in range(num_prompts):
        repeat = random.choices(no_yes, probs)[0]
        if not repeat:
            # No repeat => Modify one pixel on a fresh copy so the image
            # content differs from the previous request's.
            cur_image = cur_image.copy()
            # Spread the request index over three channels, each kept in
            # 0..255 so the value stays valid for any ``num_prompts``.
            new_val = ((i >> 16) & 0xFF, (i >> 8) & 0xFF, i & 0xFF)
            cur_image.putpixel((0, 0), new_val)

        inputs.append({
            "prompt": prompts[i % len(prompts)],
            "multi_modal_data": {
                modality: cur_image
            }
        })

    return inputs


909
910
911
912
913
def main(args):
    """Run offline multi-modal generation for the model chosen on the CLI.

    Looks up the request builder registered for ``args.model_type``, loads
    the input for ``args.modality``, creates the engine, builds either a
    single request or a batch of ``args.num_prompts`` requests, and prints
    the generated text of every output.

    Raises:
        ValueError: If ``args.model_type`` has no entry in
            ``model_example_map``.
    """
    model_type = args.model_type
    if model_type not in model_example_map:
        raise ValueError(f"Model type {model_type} is not supported.")

    modality = args.modality
    mm_input = get_multi_modal_input(args)
    data, questions = mm_input["data"], mm_input["questions"]

    req_data = model_example_map[model_type](questions, modality)

    # The seed given on the command line replaces any seed in the
    # engine arguments built by the model example.
    llm = LLM(**{**asdict(req_data.engine_args), "seed": args.seed})

    # To maintain code compatibility in this script, we add LoRA here.
    # You can also add LoRA using:
    # llm.generate(prompts, lora_request=lora_request,...)
    for lora_request in req_data.lora_requests or ():
        llm.llm_engine.add_lora(lora_request=lora_request)

    # Don't want to check the flag multiple times, so just hijack `prompts`.
    if args.use_different_prompt_per_request:
        prompts = req_data.prompts
    else:
        prompts = [req_data.prompts[0]]

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(
        temperature=0.2,
        max_tokens=64,
        stop_token_ids=req_data.stop_token_ids,
    )

    num_prompts = args.num_prompts
    assert num_prompts > 0
    if num_prompts == 1:
        # Single inference: one request dict rather than a list.
        inputs = {
            "prompt": prompts[0],
            "multi_modal_data": {modality: data},
        }
    elif args.image_repeat_prob is not None:
        # Batch inference where images repeat with the specified
        # probability of "image_repeat_prob".
        inputs = apply_image_repeat(args.image_repeat_prob, num_prompts,
                                    data, prompts, modality)
    else:
        # Batch inference with the same image for all prompts.
        inputs = [{
            "prompt": prompts[idx % len(prompts)],
            "multi_modal_data": {modality: data},
        } for idx in range(num_prompts)]

    timed = args.time_generate
    if timed:
        import time
        started = time.time()

    outputs = llm.generate(inputs, sampling_params=sampling_params)

    if timed:
        elapsed_time = time.time() - started
        print("-- generate time = {}".format(elapsed_time))

    for output in outputs:
        print(output.outputs[0].text)


if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them, then run
    # the demo.
    parser = FlexibleArgumentParser(
        description=("Demo on using vLLM for offline inference with "
                     "vision language models for text generation"),
    )

    # Which model example to run and how many requests to send.
    parser.add_argument(
        "--model-type",
        "-m",
        type=str,
        default="llava",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )
    parser.add_argument(
        "--num-prompts",
        type=int,
        default=4,
        help="Number of prompts to run.",
    )

    # Input selection.
    parser.add_argument(
        "--modality",
        type=str,
        default="image",
        choices=["image", "video"],
        help="Modality of the input.",
    )
    parser.add_argument(
        "--num-frames",
        type=int,
        default=16,
        help="Number of frames to extract from the video.",
    )

    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Set the seed when initializing `vllm.LLM`.",
    )

    # Multi-modal preprocessor cache experiments.
    parser.add_argument(
        "--image-repeat-prob",
        type=float,
        default=None,
        help=("Simulates the hit-ratio for multi-modal preprocessor cache"
              " (if enabled)"),
    )
    parser.add_argument(
        "--disable-mm-preprocessor-cache",
        action="store_true",
        help="If True, disables caching of multi-modal preprocessor/mapper.",
    )

    parser.add_argument(
        "--time-generate",
        action="store_true",
        help="If True, then print the total generate() call time",
    )

    parser.add_argument(
        "--use-different-prompt-per-request",
        action="store_true",
        help=("If True, then use different prompt (with the same multi-modal "
              "data) for each request."),
    )

    # NOTE: `args` must stay a module-level name; the model example
    # functions in this file read it directly
    # (e.g. `args.disable_mm_preprocessor_cache`).
    args = parser.parse_args()
    main(args)