# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for text generation.

For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
9
import os
import random
import time

from huggingface_hub import snapshot_download
from transformers import AutoTokenizer

from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.lora.request import LoRARequest
from vllm.utils import FlexibleArgumentParser

21
22
23
24
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.

25

26
# Aria
27
def run_aria(questions: list[str], modality: str):
    """Create the Aria engine and one chat-formatted prompt per question.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompts, stop_token_ids)`` tuple.
    """
    assert modality == "image"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    engine_kwargs = dict(
        model="rhymes-ai/Aria",
        max_model_len=4096,
        max_num_seqs=2,
        dtype="bfloat16",
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    llm = LLM(**engine_kwargs)

    head = "<|im_start|>user\n<fim_prefix><|img|><fim_suffix>"
    tail = "<|im_end|>\n<|im_start|>assistant\n"
    prompts = [f"{head}{question}{tail}" for question in questions]

    return llm, prompts, [93532, 93653, 944, 93421, 1019, 93653, 93519]
44
45
46


# BLIP-2
47
def run_blip2(questions: list[str], modality: str):
    """Create the BLIP-2 engine and one "Question/Answer" prompt per question.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompts, stop_token_ids)`` tuple; no stop tokens are set.
    """
    assert modality == "image"

    # The prompt format given on the HuggingFace model repository is
    # inaccurate for BLIP-2.
    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
    template = "Question: {} Answer:"
    prompts = [template.format(question) for question in questions]

    llm = LLM(
        model="Salesforce/blip2-opt-2.7b",
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    return llm, prompts, None
57
58
59


# Chameleon
60
def run_chameleon(questions: list[str], modality: str):
    """Create the Chameleon engine; each prompt is the question plus ``<image>``.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompts, stop_token_ids)`` tuple; no stop tokens are set.
    """
    assert modality == "image"

    image_tag = "<image>"
    prompts = [f"{question}{image_tag}" for question in questions]

    engine_kwargs = dict(
        model="facebook/chameleon-7b",
        max_model_len=4096,
        max_num_seqs=2,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    llm = LLM(**engine_kwargs)
    return llm, prompts, None
70
71


72
# Deepseek-VL2
73
def run_deepseek_vl2(questions: list[str], modality: str):
    """Create the Deepseek-VL2 (tiny) engine and its user/assistant prompts.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompts, stop_token_ids)`` tuple; no stop tokens are set.
    """
    assert modality == "image"

    llm = LLM(
        model="deepseek-ai/deepseek-vl2-tiny",
        max_model_len=4096,
        max_num_seqs=2,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
        # The architecture name is overridden explicitly for this checkpoint.
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
    )

    user_prefix = "<|User|>: <image>\n"
    assistant_suffix = "\n\n<|Assistant|>:"
    prompts = [
        f"{user_prefix}{question}{assistant_suffix}" for question in questions
    ]
    return llm, prompts, None
90
91


92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# Florence2
def run_florence2(question: list[str], modality: str):
    """Create the Florence-2 engine and its task prompts.

    Florence-2 is driven by a fixed task token rather than a free-form
    question, so the content of ``question`` is ignored; only its length is
    used so that one prompt is produced per entry, like the other runners.

    The prompts are returned as a list. The caller indexes the result
    (``prompts[0]``, ``prompts[i % len(prompts)]``), and a bare string would
    be sliced down to its first character ("<") instead of the task token.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompts, stop_token_ids)`` tuple; no stop tokens are set.
    """
    assert modality == "image"

    llm = LLM(model="microsoft/Florence-2-large",
              tokenizer="facebook/bart-large",
              max_num_seqs=8,
              trust_remote_code=True,
              dtype="bfloat16",
              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)

    prompts = ["<MORE_DETAILED_CAPTION>" for _ in question]
    stop_token_ids = None
    return llm, prompts, stop_token_ids


108
# Fuyu
109
def run_fuyu(questions: list[str], modality: str):
    """Create the Fuyu engine; each prompt is the question plus a newline.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompts, stop_token_ids)`` tuple; no stop tokens are set.
    """
    assert modality == "image"

    newline = "\n"
    prompts = [f"{question}{newline}" for question in questions]

    engine_kwargs = dict(
        model="adept/fuyu-8b",
        max_model_len=2048,
        max_num_seqs=2,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    llm = LLM(**engine_kwargs)
    return llm, prompts, None
119
120


121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# Gemma 3
def run_gemma3(questions: list[str], modality: str):
    """Create the Gemma 3 engine and one turn-formatted prompt per question.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompts, stop_token_ids)`` tuple; no stop tokens are set.
    """
    assert modality == "image"

    llm = LLM(
        model="google/gemma-3-4b-it",
        max_model_len=2048,
        max_num_seqs=2,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    user_turn_open = "<bos><start_of_turn>user\n<start_of_image>"
    model_turn_open = "<end_of_turn>\n<start_of_turn>model\n"
    prompts = [
        f"{user_turn_open}{question}{model_turn_open}"
        for question in questions
    ]
    return llm, prompts, None


138
# GLM-4v
139
def run_glm4v(questions: list[str], modality: str):
    """Create the GLM-4v engine and one prompt per question.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompts, stop_token_ids)`` tuple.
    """
    assert modality == "image"
    model_name = "THUDM/glm-4v-9b"

    llm = LLM(model=model_name,
              max_model_len=2048,
              max_num_seqs=2,
              trust_remote_code=True,
              enforce_eager=True,
              hf_overrides={"architectures": ["GLM4VForCausalLM"]},
              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)

    # Use implicit string concatenation rather than a backslash continuation
    # inside the literal: the continuation form embedded the source
    # indentation (a run of spaces) between the image tokens and the question.
    prompts = [
        ("<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>"
         f"{question}<|assistant|>") for question in questions
    ]

    stop_token_ids = [151329, 151336, 151338]
    return llm, prompts, stop_token_ids
158
159
160


# H2OVL-Mississippi
161
def run_h2ovl(questions: list[str], modality: str):
    """Create the H2OVL-Mississippi engine and chat-template prompts.

    Prompts are rendered with the model's own tokenizer chat template.
    Only the "image" modality is accepted. Returns a
    ``(llm, prompts, stop_token_ids)`` tuple.
    """
    assert modality == "image"

    checkpoint = "h2oai/h2ovl-mississippi-800m"

    llm = LLM(
        model=checkpoint,
        trust_remote_code=True,
        max_model_len=8192,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    tokenizer = AutoTokenizer.from_pretrained(checkpoint,
                                              trust_remote_code=True)

    # One single-turn conversation per question.
    conversations = []
    for question in questions:
        user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
        conversations.append([user_turn])

    prompts = tokenizer.apply_chat_template(conversations,
                                            tokenize=False,
                                            add_generation_prompt=True)

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    return llm, prompts, [tokenizer.eos_token_id]
187
188
189


# Idefics3-8B-Llama3
190
def run_idefics3(questions: list[str], modality: str):
    """Create the Idefics3-8B-Llama3 engine and one prompt per question.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompts, stop_token_ids)`` tuple; no stop tokens are set.
    """
    assert modality == "image"

    # If you are running out of memory, you can reduce the "longest_edge".
    # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
    processor_kwargs = {"size": {"longest_edge": 3 * 364}}

    llm = LLM(
        model="HuggingFaceM4/Idefics3-8B-Llama3",
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        mm_processor_kwargs=processor_kwargs,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    user_prefix = "<|begin_of_text|>User:<image>"
    assistant_suffix = "<end_of_utterance>\nAssistant:"
    prompts = [
        f"{user_prefix}{question}{assistant_suffix}" for question in questions
    ]
    return llm, prompts, None
213
214
215


# InternVL
216
def run_internvl(questions: list[str], modality: str):
    """Create the InternVL2 engine and chat-template prompts.

    Prompts are rendered with the model's own tokenizer chat template.
    Only the "image" modality is accepted. Returns a
    ``(llm, prompts, stop_token_ids)`` tuple.
    """
    assert modality == "image"

    checkpoint = "OpenGVLab/InternVL2-2B"

    llm = LLM(
        model=checkpoint,
        trust_remote_code=True,
        max_model_len=4096,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    tokenizer = AutoTokenizer.from_pretrained(checkpoint,
                                              trust_remote_code=True)

    # One single-turn conversation per question.
    conversations = []
    for question in questions:
        user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
        conversations.append([user_turn])

    prompts = tokenizer.apply_chat_template(conversations,
                                            tokenize=False,
                                            add_generation_prompt=True)

    # Stop tokens for InternVL.
    # Model variants may have different stop tokens; please refer to the
    # model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_words = ("<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>")
    stop_token_ids = list(map(tokenizer.convert_tokens_to_ids, stop_words))
    return llm, prompts, stop_token_ids
245
246


247
# LLaVA-1.5
248
def run_llava(questions: list[str], modality: str):
    """Create the LLaVA-1.5 engine and one USER/ASSISTANT prompt per question.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompts, stop_token_ids)`` tuple; no stop tokens are set.
    """
    assert modality == "image"

    template = "USER: <image>\n{}\nASSISTANT:"
    prompts = [template.format(question) for question in questions]

    llm = LLM(
        model="llava-hf/llava-1.5-7b-hf",
        max_model_len=4096,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    return llm, prompts, None
260
261
262


# LLaVA-1.6/LLaVA-NeXT
263
def run_llava_next(questions: list[str], modality: str):
    """Create the LLaVA-1.6/LLaVA-NeXT engine and ``[INST]`` prompts.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompts, stop_token_ids)`` tuple; no stop tokens are set.
    """
    assert modality == "image"

    template = "[INST] <image>\n{} [/INST]"
    prompts = [template.format(question) for question in questions]

    llm = LLM(
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        max_model_len=8192,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    return llm, prompts, None
272
273
274
275


# LlaVA-NeXT-Video
# Currently only support for video input
276
def run_llava_next_video(questions: list[str], modality: str):
    """Create the LLaVA-NeXT-Video engine and one video prompt per question.

    Only the "video" modality is accepted. Returns a
    ``(llm, prompts, stop_token_ids)`` tuple; no stop tokens are set.
    """
    assert modality == "video"

    template = "USER: <video>\n{} ASSISTANT:"
    prompts = [template.format(question) for question in questions]

    llm = LLM(
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        max_model_len=8192,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    return llm, prompts, None
287
288


289
# LLaVA-OneVision
290
def run_llava_onevision(questions: list[str], modality: str):
    """Create the LLaVA-OneVision engine and one prompt per question.

    Both "image" and "video" are supported; the modality name doubles as the
    placeholder tag (``<image>`` / ``<video>``) in the prompt.

    Returns a ``(llm, prompts, stop_token_ids)`` tuple; no stop tokens are
    set.

    Raises:
        ValueError: if ``modality`` is neither "image" nor "video". This is
            checked before the model is loaded; previously ``prompts`` was
            simply left unbound for an unsupported modality.
    """
    if modality not in ("image", "video"):
        raise ValueError(f"Modality {modality} is not supported.")

    # Implicit string concatenation is used instead of a backslash
    # continuation inside the literal, which embedded the source indentation
    # (a run of spaces) between the user and assistant turns.
    prompts = [
        (f"<|im_start|>user <{modality}>\n{question}<|im_end|>"
         "<|im_start|>assistant\n") for question in questions
    ]

    llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
              max_model_len=16384,
              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
    stop_token_ids = None
    return llm, prompts, stop_token_ids
309
310


311
# Mantis
312
def run_mantis(questions: list[str], modality: str):
    """Create the Mantis engine and Llama-3 style chat prompts.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompts, stop_token_ids)`` tuple.
    """
    assert modality == "image"

    # Llama-3 chat layout: user header, message body, then assistant header.
    user_header = "<|start_header_id|>user<|end_header_id|>\n\n"
    assistant_header = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"  # noqa: E501
    prompts = [
        f"{user_header}{question}\n<image>{assistant_header}"
        for question in questions
    ]

    engine_kwargs = dict(
        model="TIGER-Lab/Mantis-8B-siglip-llama3",
        max_model_len=4096,
        hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    llm = LLM(**engine_kwargs)
    return llm, prompts, [128009]
329
330
331


# MiniCPM-V
332
def run_minicpmv_base(questions: list[str], modality: str, model_name):
    """Shared setup for the MiniCPM-V / MiniCPM-o runners.

    Loads the tokenizer and engine for ``model_name`` and renders one
    chat-template prompt per question. "image" and "video" are accepted.
    Returns a ``(llm, prompts, stop_token_ids)`` tuple.
    """
    assert modality in ["image", "video"]
    # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa

    # Candidate values for `model_name`, and the modalities each supports:
    #   2.0  (image):               "HwwwH/MiniCPM-V-2"
    #        The official repo doesn't work yet, so a fork is needed for now.
    #        For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa
    #   2.5  (image):               "openbmb/MiniCPM-Llama3-V-2_5"
    #   2.6  (image, video):        "openbmb/MiniCPM-V-2_6"
    #   o2.6 (image, video, audio): "openbmb/MiniCPM-o-2_6"

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    engine_kwargs = dict(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    llm = LLM(**engine_kwargs)

    # NOTE: The stop_token_ids differ between versions of MiniCPM-V.
    #   2.0:        stop_token_ids = [tokenizer.eos_id]
    #   2.5:        stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
    #   2.6 / o2.6: the two special tokens converted below.
    stop_words = ('<|im_end|>', '<|endoftext|>')
    stop_token_ids = list(map(tokenizer.convert_tokens_to_ids, stop_words))

    placeholders = {
        "image": "(<image>./</image>)",
        "video": "(<video>./</video>)",
    }

    # The chat template is applied to each single-turn conversation
    # separately.
    prompts = []
    for question in questions:
        user_turn = {
            'role': 'user',
            'content': f"{placeholders[modality]}\n{question}",
        }
        rendered = tokenizer.apply_chat_template([user_turn],
                                                 tokenize=False,
                                                 add_generation_prompt=True)
        prompts.append(rendered)
    return llm, prompts, stop_token_ids
389
390


391
392
def run_minicpmo(questions: list[str], modality: str):
    """Run the shared MiniCPM setup with the MiniCPM-o 2.6 checkpoint."""
    checkpoint = "openbmb/MiniCPM-o-2_6"
    return run_minicpmv_base(questions, modality, checkpoint)
393
394


395
396
def run_minicpmv(questions: list[str], modality: str):
    """Run the shared MiniCPM setup with the MiniCPM-V 2.6 checkpoint."""
    checkpoint = "openbmb/MiniCPM-V-2_6"
    return run_minicpmv_base(questions, modality, checkpoint)
397
398


399
# LLama 3.2
400
def run_mllama(questions: list[str], modality: str):
    """Create the Llama 3.2 Vision engine and chat-template prompts.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompts, stop_token_ids)`` tuple; no stop tokens are set.
    """
    assert modality == "image"

    checkpoint = "meta-llama/Llama-3.2-11B-Vision-Instruct"

    # Note: The default setting of max_num_seqs (256) and
    # max_model_len (131072) for this model may cause OOM.
    # You may lower either to run this example on lower-end GPUs.

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_kwargs = dict(
        model=checkpoint,
        max_model_len=4096,
        max_num_seqs=16,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    llm = LLM(**engine_kwargs)

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # Each conversation is one user turn: an image part, then the question.
    conversations = []
    for question in questions:
        parts = [{"type": "image"}, {"type": "text", "text": question}]
        conversations.append([{"role": "user", "content": parts}])

    prompts = tokenizer.apply_chat_template(conversations,
                                            add_generation_prompt=True,
                                            tokenize=False)
    return llm, prompts, None
433
434


435
# Molmo
436
def run_molmo(questions: list[str], modality: str):
    """Create the Molmo engine and one prompt per question.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompts, stop_token_ids)`` tuple; no stop tokens are set.
    """
    assert modality == "image"

    model_name = "allenai/Molmo-7B-D-0924"

    llm = LLM(
        model=model_name,
        trust_remote_code=True,
        dtype="bfloat16",
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    # Implicit string concatenation is used instead of a backslash
    # continuation inside the literal, which embedded the source indentation
    # (a run of spaces) between the user and assistant turns.
    prompts = [
        (f"<|im_start|>user <image>\n{question}<|im_end|>"
         "<|im_start|>assistant\n") for question in questions
    ]
    stop_token_ids = None
    return llm, prompts, stop_token_ids
454
455


456
# NVLM-D
457
def run_nvlm_d(questions: list[str], modality: str):
    """Create the NVLM-D engine and chat-template prompts.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompts, stop_token_ids)`` tuple; no stop tokens are set.
    """
    assert modality == "image"

    checkpoint = "nvidia/NVLM-D-72B"

    # Adjust this as necessary to fit in GPU
    engine_kwargs = dict(
        model=checkpoint,
        trust_remote_code=True,
        max_model_len=4096,
        tensor_parallel_size=4,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    llm = LLM(**engine_kwargs)

    tokenizer = AutoTokenizer.from_pretrained(checkpoint,
                                              trust_remote_code=True)

    # One single-turn conversation per question.
    conversations = []
    for question in questions:
        user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
        conversations.append([user_turn])

    prompts = tokenizer.apply_chat_template(conversations,
                                            tokenize=False,
                                            add_generation_prompt=True)
    return llm, prompts, None
482
483


484
485
# PaliGemma
def run_paligemma(question: str, modality: str):
    """Create the PaliGemma engine with a fixed captioning prompt.

    ``question`` is not used: the prompt list always holds the single task
    string "caption en". Only the "image" modality is accepted. Returns a
    ``(llm, prompt, stop_token_ids)`` tuple; no stop tokens are set.
    """
    assert modality == "image"

    # PaliGemma has special prompt format for VQA
    task_prompts = ["caption en"]

    llm = LLM(
        model="google/paligemma-3b-mix-224",
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    return llm, task_prompts, None
494
495


496
497
# PaliGemma 2
def run_paligemma2(question: str, modality: str):
    """Create the PaliGemma 2 engine with a fixed captioning prompt.

    ``question`` is not used: the prompt list always holds the single task
    string "caption en". Only the "image" modality is accepted. Returns a
    ``(llm, prompt, stop_token_ids)`` tuple; no stop tokens are set.
    """
    assert modality == "image"

    # PaliGemma 2 has special prompt format for VQA
    task_prompts = ["caption en"]

    llm = LLM(
        model="google/paligemma2-3b-ft-docci-448",
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    return llm, task_prompts, None


508
# Phi-3-Vision
509
def run_phi3v(questions: list[str], modality: str):
    """Create the Phi-3.5-vision engine and one prompt per question.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompts, stop_token_ids)`` tuple; no stop tokens are set.
    """
    assert modality == "image"

    user_prefix = "<|user|>\n<|image_1|>\n"
    assistant_suffix = "<|end|>\n<|assistant|>\n"
    prompts = [
        f"{user_prefix}{question}{assistant_suffix}" for question in questions
    ]

    # num_crops is an override kwarg to the multimodal image processor;
    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
    # to use 16 for single frame scenarios, and 4 for multi-frame.
    #
    # Generally speaking, a larger value for num_crops results in more
    # tokens per image instance, because it may scale the image more in
    # the image preprocessing. Some references in the model docs and the
    # formula for image tokens after the preprocessing
    # transform can be found below.
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    # Note - mm_processor_kwargs can also be passed to generate/chat calls
    processor_kwargs = {"num_crops": 16}

    llm = LLM(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        mm_processor_kwargs=processor_kwargs,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    return llm, prompts, None
540
541


542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
# Phi-4-multimodal-instruct
def run_phi4mm(questions: list[str], modality: str):
    """
    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
    show how to process image inputs.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompts, stop_token_ids)`` tuple; no stop tokens are set.
    """
    assert modality == "image"
    model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    vision_lora_path = os.path.join(model_path, "vision-lora")
    prompts = [
        f"<|user|><|image_1|>{question}<|end|><|assistant|>"
        for question in questions
    ]
    llm = LLM(
        model=model_path,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        enable_lora=True,
        max_lora_rank=320,
        lora_extra_vocab_size=0,
        # Forward the CLI flag like every other runner in this file does, so
        # that `--disable-mm-preprocessor-cache` is not silently ignored
        # for this model.
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    lora_request = LoRARequest("vision", 1, vision_lora_path)
    # To maintain code compatibility in this script, we add LoRA here.
    llm.llm_engine.add_lora(lora_request=lora_request)
    # You can also add LoRA using:
    # llm.generate(prompts, lora_request=lora_request,...)

    stop_token_ids = None
    return llm, prompts, stop_token_ids


576
# Pixtral HF-format
577
def run_pixtral_hf(questions: list[str], modality: str):
    """Create the HF-format Pixtral engine and ``[INST]`` prompts.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompts, stop_token_ids)`` tuple; no stop tokens are set.
    """
    assert modality == "image"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    engine_kwargs = dict(
        model="mistral-community/pixtral-12b",
        max_model_len=8192,
        max_num_seqs=2,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    llm = LLM(**engine_kwargs)

    template = "<s>[INST]{}\n[IMG][/INST]"
    prompts = [template.format(question) for question in questions]
    return llm, prompts, None
593
594


595
# Qwen
596
def run_qwen_vl(questions: list[str], modality: str):
    """Create the Qwen-VL engine and one prompt per question.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompts, stop_token_ids)`` tuple; no stop tokens are set.
    """
    assert modality == "image"

    engine_kwargs = dict(
        model="Qwen/Qwen-VL",
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    llm = LLM(**engine_kwargs)

    picture_suffix = "Picture 1: <img></img>\n"
    prompts = [f"{question}{picture_suffix}" for question in questions]
    return llm, prompts, None
611
612


613
# Qwen2-VL
614
def run_qwen2_vl(questions: list[str], modality: str):
    """Create the Qwen2-VL engine and one chat-formatted prompt per question.

    Both "image" and "video" are supported; the modality selects the vision
    placeholder token used in the prompt.

    Returns a ``(llm, prompts, stop_token_ids)`` tuple; no stop tokens are
    set.

    Raises:
        ValueError: if ``modality`` is neither "image" nor "video". This is
            checked before the model is loaded; previously an unsupported
            modality only failed afterwards, on an unbound ``placeholder``.
    """
    placeholders = {
        "image": "<|image_pad|>",
        "video": "<|video_pad|>",
    }
    if modality not in placeholders:
        raise ValueError(f"Modality {modality} is not supported.")
    placeholder = placeholders[modality]

    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    llm = LLM(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
        },
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    prompts = [
        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
         f"{question}<|im_end|>\n"
         "<|im_start|>assistant\n") for question in questions
    ]
    stop_token_ids = None
    return llm, prompts, stop_token_ids
643
644


Roger Wang's avatar
Roger Wang committed
645
# Qwen2.5-VL
646
def run_qwen2_5_vl(questions: list[str], modality: str):
    """Create the Qwen2.5-VL engine and one chat-formatted prompt per question.

    Both "image" and "video" are supported; the modality selects the vision
    placeholder token used in the prompt.

    Returns a ``(llm, prompts, stop_token_ids)`` tuple; no stop tokens are
    set.

    Raises:
        ValueError: if ``modality`` is neither "image" nor "video". This is
            checked before the model is loaded; previously an unsupported
            modality only failed afterwards, on an unbound ``placeholder``.
    """
    placeholders = {
        "image": "<|image_pad|>",
        "video": "<|video_pad|>",
    }
    if modality not in placeholders:
        raise ValueError(f"Modality {modality} is not supported.")
    placeholder = placeholders[modality]

    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

    llm = LLM(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    prompts = [
        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
         f"{question}<|im_end|>\n"
         "<|im_start|>assistant\n") for question in questions
    ]
    stop_token_ids = None
    return llm, prompts, stop_token_ids
Roger Wang's avatar
Roger Wang committed
675
676


677
# Maps each value accepted by `--model-type` to the function that builds the
# engine, the prompts and the stop token ids for that model.
# The insertion order is also the order in which the choices are listed.
model_example_map = dict([
    ("aria", run_aria),
    ("blip-2", run_blip2),
    ("chameleon", run_chameleon),
    ("deepseek_vl_v2", run_deepseek_vl2),
    ("florence2", run_florence2),
    ("fuyu", run_fuyu),
    ("gemma3", run_gemma3),
    ("glm4v", run_glm4v),
    ("h2ovl_chat", run_h2ovl),
    ("idefics3", run_idefics3),
    ("internvl_chat", run_internvl),
    ("llava", run_llava),
    ("llava-next", run_llava_next),
    ("llava-next-video", run_llava_next_video),
    ("llava-onevision", run_llava_onevision),
    ("mantis", run_mantis),
    ("minicpmo", run_minicpmo),
    ("minicpmv", run_minicpmv),
    ("mllama", run_mllama),
    ("molmo", run_molmo),
    ("NVLM_D", run_nvlm_d),
    ("paligemma", run_paligemma),
    ("paligemma2", run_paligemma2),
    ("phi3_v", run_phi3v),
    ("phi4_mm", run_phi4mm),
    ("pixtral_hf", run_pixtral_hf),
    ("qwen_vl", run_qwen_vl),
    ("qwen2_vl", run_qwen2_vl),
    ("qwen2_5_vl", run_qwen2_5_vl),
])


710
711
712
713
714
715
716
717
718
719
720
def get_multi_modal_input(args):
    """Load the demo asset and the questions for ``args.modality``.

    Returns:
        {
            "data": image or video,
            "questions": list of question strings,
        }

    Raises:
        ValueError: if ``args.modality`` is neither "image" nor "video".
    """
    modality = args.modality
    if modality not in ("image", "video"):
        raise ValueError(f"Modality {args.modality} is not supported.")

    if modality == "video":
        # Input video and question
        video_asset = VideoAsset(name="sample_demo_1.mp4",
                                 num_frames=args.num_frames)
        return {
            "data": video_asset.np_ndarrays,
            "questions": ["Why is this video funny?"],
        }

    # Input image and questions
    rgb_image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
    return {
        "data": rgb_image,
        "questions": [
            "What is the content of this image?",
            "Describe the content of this image in detail.",
            "What's in the image?",
            "Where is this image taken?",
        ],
    }


748
749
def apply_image_repeat(image_repeat_prob, num_prompts, data,
                       prompts: list[str], modality):
    """Repeats images with provided probability of "image_repeat_prob".
    Used to simulate hit/miss for the MM preprocessor cache.

    For each of the ``num_prompts`` requests, the previous image is reused
    with probability ``image_repeat_prob``; otherwise a copy is made and its
    pixel at (0, 0) is overwritten with a value derived from the request
    index, so that the copy differs from the images before it.

    Args:
        image_repeat_prob: probability in [0, 1] of reusing the previous
            image for a request.
        num_prompts: number of request dicts to build.
        data: the base image; must provide ``copy()`` and
            ``putpixel((x, y), value)``. It is never modified in place.
        prompts: prompt strings, cycled through when there are fewer
            prompts than requests.
        modality: key under which the image is stored in
            ``multi_modal_data``.

    Returns:
        A list of ``{"prompt": ..., "multi_modal_data": {modality: image}}``
        dicts.

    Raises:
        ValueError: if ``image_repeat_prob`` is outside [0, 1], or if
            ``prompts`` is empty while ``num_prompts`` is positive.
    """
    # Raise explicitly instead of using `assert`, which is stripped under -O.
    if not 0 <= image_repeat_prob <= 1.0:
        raise ValueError(
            f"image_repeat_prob must be within [0, 1], got {image_repeat_prob}"
        )
    if num_prompts > 0 and not prompts:
        raise ValueError("prompts must not be empty")

    no_yes = [0, 1]
    probs = [1.0 - image_repeat_prob, image_repeat_prob]

    inputs = []
    cur_image = data
    for i in range(num_prompts):
        res = random.choices(no_yes, probs)[0]
        if res == 0:
            # No repeat => Modify one pixel
            cur_image = cur_image.copy()
            # Encode the request index as a 24-bit value, one byte per
            # channel, so that every channel stays within 0..255 even for
            # i >= 65536.
            new_val = ((i >> 16) & 0xFF, (i >> 8) & 0xFF, i & 0xFF)
            cur_image.putpixel((0, 0), new_val)

        inputs.append({
            "prompt": prompts[i % len(prompts)],
            "multi_modal_data": {
                modality: cur_image
            }
        })

    return inputs


778
779
780
781
782
def main(args):
    """Run offline inference for the selected model and print the outputs.

    Looks up the runner for ``args.model_type``, loads the demo image or
    video, builds ``args.num_prompts`` requests and prints the generated
    text of each one.

    Raises:
        ValueError: if ``args.model_type`` is not a key of
            ``model_example_map`` or if ``args.num_prompts`` is not positive.
            Both are checked before the model is loaded.
    """
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")
    # Validate up front, so that a bad value fails before the model load.
    if args.num_prompts <= 0:
        raise ValueError(
            f"--num-prompts must be positive, got {args.num_prompts}.")

    modality = args.modality
    mm_input = get_multi_modal_input(args)
    data = mm_input["data"]
    questions = mm_input["questions"]

    llm, prompts, stop_token_ids = model_example_map[model](questions,
                                                            modality)
    # Don't want to check the flag multiple times, so just hijack `prompts`.
    if not args.use_different_prompt_per_request:
        prompts = [prompts[0]]

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=stop_token_ids)

    if args.num_prompts == 1:
        # Single inference
        inputs = {
            "prompt": prompts[0],
            "multi_modal_data": {
                modality: data
            },
        }
    elif args.image_repeat_prob is not None:
        # Batch inference: repeat images with the specified probability of
        # "image_repeat_prob"
        inputs = apply_image_repeat(args.image_repeat_prob, args.num_prompts,
                                    data, prompts, modality)
    else:
        # Batch inference: use the same image for all prompts
        inputs = [{
            "prompt": prompts[i % len(prompts)],
            "multi_modal_data": {
                modality: data
            },
        } for i in range(args.num_prompts)]

    # A monotonic clock is used so that the measurement is not affected by
    # wall-clock adjustments.
    start_time = time.perf_counter()
    outputs = llm.generate(inputs, sampling_params=sampling_params)
    elapsed_time = time.perf_counter() - start_time
    if args.time_generate:
        print(f"-- generate time = {elapsed_time}")

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


if __name__ == "__main__":
    # Command-line entry point.
    # NOTE: the parsed namespace is bound to the module-level name `args`,
    # which the `run_*` functions read for `disable_mm_preprocessor_cache`.
    parser = FlexibleArgumentParser(
        description=("Demo on using vLLM for offline inference with "
                     "vision language models for text generation"))

    # Model and workload selection.
    parser.add_argument("--model-type",
                        "-m",
                        type=str,
                        default="llava",
                        choices=model_example_map.keys(),
                        help='Huggingface "model_type".')
    parser.add_argument("--num-prompts",
                        type=int,
                        default=4,
                        help="Number of prompts to run.")
    parser.add_argument("--modality",
                        type=str,
                        default="image",
                        choices=["image", "video"],
                        help="Modality of the input.")
    parser.add_argument("--num-frames",
                        type=int,
                        default=16,
                        help="Number of frames to extract from the video.")

    # Multi-modal preprocessor cache behaviour.
    parser.add_argument(
        "--image-repeat-prob",
        type=float,
        default=None,
        help=("Simulates the hit-ratio for multi-modal preprocessor cache"
              " (if enabled)"))
    parser.add_argument(
        "--disable-mm-preprocessor-cache",
        action="store_true",
        help="If True, disables caching of multi-modal preprocessor/mapper.")

    # Timing and prompt variation.
    parser.add_argument(
        "--time-generate",
        action="store_true",
        help="If True, then print the total generate() call time")
    parser.add_argument(
        "--use-different-prompt-per-request",
        action="store_true",
        help=("If True, then use different prompt (with the same multi-modal "
              "data) for each request."))

    args = parser.parse_args()
    main(args)