"vllm/vscode:/vscode.git/clone" did not exist on "b5e8d01ebde8ea1a2d82d014d62a96a9e5fd1e2a"
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for text generation.

For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
import os
10
11
import random

12
from huggingface_hub import snapshot_download
13
14
15
16
from transformers import AutoTokenizer

from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
17
from vllm.assets.video import VideoAsset
18
from vllm.lora.request import LoRARequest
19
20
from vllm.utils import FlexibleArgumentParser

21
22
23
24
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.

25

26
# Aria
27
def run_aria(questions: list[str], modality: str):
    """Build the engine, prompts and stop tokens for rhymes-ai/Aria.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompts, stop_token_ids)`` tuple with one prompt per question.
    The preprocessor-cache switch is read from the module-level ``args``.
    """
    assert modality == "image"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    engine_kwargs = dict(
        model="rhymes-ai/Aria",
        max_model_len=4096,
        max_num_seqs=2,
        dtype="bfloat16",
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    llm = LLM(**engine_kwargs)

    head = "<|im_start|>user\n<fim_prefix><|img|><fim_suffix>"
    tail = "<|im_end|>\n<|im_start|>assistant\n"
    prompts = [f"{head}{question}{tail}" for question in questions]

    stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
    return llm, prompts, stop_token_ids
44
45
46


# BLIP-2
47
def run_blip2(questions: list[str], modality: str):
    """Build the engine and prompts for Salesforce/blip2-opt-2.7b.

    Only the "image" modality is accepted. Returns
    ``(llm, prompts, stop_token_ids)``; no stop tokens are set.
    """
    assert modality == "image"

    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
    prompts = []
    for question in questions:
        prompts.append(f"Question: {question} Answer:")

    llm = LLM(
        model="Salesforce/blip2-opt-2.7b",
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    return llm, prompts, None
57
58
59


# Chameleon
60
def run_chameleon(questions: list[str], modality: str):
    """Build the engine and prompts for facebook/chameleon-7b.

    Only the "image" modality is accepted. Each prompt is the question
    followed by the ``<image>`` placeholder. Returns
    ``(llm, prompts, stop_token_ids)``; no stop tokens are set.
    """
    assert modality == "image"

    prompts = []
    for question in questions:
        prompts.append(f"{question}<image>")

    llm = LLM(
        model="facebook/chameleon-7b",
        max_model_len=4096,
        max_num_seqs=2,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    return llm, prompts, None
70
71


72
# Deepseek-VL2
73
def run_deepseek_vl2(questions: list[str], modality: str):
    """Build the engine and prompts for deepseek-ai/deepseek-vl2-tiny.

    Only the "image" modality is accepted. The architecture is overridden
    to ``DeepseekVLV2ForCausalLM`` via ``hf_overrides``. Returns
    ``(llm, prompts, stop_token_ids)``; no stop tokens are set.
    """
    assert modality == "image"

    engine_kwargs = dict(
        model="deepseek-ai/deepseek-vl2-tiny",
        max_model_len=4096,
        max_num_seqs=2,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
    )
    llm = LLM(**engine_kwargs)

    prompts = []
    for question in questions:
        prompts.append(f"<|User|>: <image>\n{question}\n\n<|Assistant|>:")

    return llm, prompts, None
90
91


92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# Florence2
def run_florence2(question: list[str], modality: str):
    """Build the engine and prompts for microsoft/Florence-2-large.

    Only the "image" modality is accepted. Florence-2 is driven by a fixed
    task token, so the text of the questions is not used; only their count
    determines how many prompts are produced.

    Args:
        question: The questions supplied by the caller (a list, or a single
            string which is treated as one question).
        modality: Must be "image".

    Returns:
        ``(llm, prompts, stop_token_ids)`` where ``prompts`` is a list of
        task prompts and ``stop_token_ids`` is ``None``.
    """
    assert modality == "image"

    llm = LLM(model="microsoft/Florence-2-large",
              tokenizer="facebook/bart-large",
              max_num_seqs=8,
              trust_remote_code=True,
              dtype="bfloat16",
              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)

    # Return a list like every other builder: the caller indexes
    # `prompts[0]`, which on a bare string would yield just "<".
    num_prompts = 1 if isinstance(question, str) else len(question)
    prompts = ["<MORE_DETAILED_CAPTION>"] * num_prompts
    stop_token_ids = None
    return llm, prompts, stop_token_ids


108
# Fuyu
109
def run_fuyu(questions: list[str], modality: str):
    """Build the engine and prompts for adept/fuyu-8b.

    Only the "image" modality is accepted. Each prompt is the question
    followed by a newline. Returns ``(llm, prompts, stop_token_ids)``;
    no stop tokens are set.
    """
    assert modality == "image"

    prompts = []
    for question in questions:
        prompts.append(f"{question}\n")

    llm = LLM(
        model="adept/fuyu-8b",
        max_model_len=2048,
        max_num_seqs=2,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    return llm, prompts, None
119
120


121
122
123
124
125
# Gemma 3
def run_gemma3(questions: list[str], modality: str):
    """Build the engine and prompts for google/gemma-3-4b-it.

    Only the "image" modality is accepted. Returns
    ``(llm, prompts, stop_token_ids)``; no stop tokens are set.
    """
    assert modality == "image"

    engine_kwargs = dict(
        model="google/gemma-3-4b-it",
        max_model_len=2048,
        max_num_seqs=2,
        # Default is False; setting it to True is not supported in V1 yet
        mm_processor_kwargs={"do_pan_and_scan": True},
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    llm = LLM(**engine_kwargs)

    turn_open = "<bos><start_of_turn>user\n"
    turn_close = "<start_of_turn>model\n"
    prompts = []
    for question in questions:
        user_turn = f"<start_of_image>{question}<end_of_turn>\n"
        prompts.append(turn_open + user_turn + turn_close)

    return llm, prompts, None


142
# GLM-4v
143
def run_glm4v(questions: list[str], modality: str):
    """Build the engine, prompts and stop tokens for THUDM/glm-4v-9b.

    Only the "image" modality is accepted.

    Returns:
        ``(llm, prompts, stop_token_ids)`` with one prompt per question.
    """
    assert modality == "image"
    model_name = "THUDM/glm-4v-9b"

    llm = LLM(model=model_name,
              max_model_len=2048,
              max_num_seqs=2,
              trust_remote_code=True,
              enforce_eager=True,
              hf_overrides={"architectures": ["GLM4VForCausalLM"]},
              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)

    # Use adjacent string literals instead of a backslash continuation
    # inside the f-string: the continuation embedded the next source line's
    # indentation (a run of spaces) between the image tokens and the question.
    prompts = [
        ("<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>"
         f"{question}<|assistant|>") for question in questions
    ]

    stop_token_ids = [151329, 151336, 151338]
    return llm, prompts, stop_token_ids
162
163
164


# H2OVL-Mississippi
165
def run_h2ovl(questions: list[str], modality: str):
    """Build the engine, prompts and stop tokens for H2OVL-Mississippi.

    Only the "image" modality is accepted. Prompts are rendered through the
    model tokenizer's chat template. Returns
    ``(llm, prompts, stop_token_ids)``.
    """
    assert modality == "image"

    model_name = "h2oai/h2ovl-mississippi-800m"
    engine_kwargs = dict(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    llm = LLM(**engine_kwargs)

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)

    # One single-turn conversation per question.
    conversations = []
    for question in questions:
        user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
        conversations.append([user_turn])
    prompts = tokenizer.apply_chat_template(conversations,
                                            tokenize=False,
                                            add_generation_prompt=True)

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    eos_id = tokenizer.eos_token_id
    return llm, prompts, [eos_id]
191
192
193


# Idefics3-8B-Llama3
194
def run_idefics3(questions: list[str], modality: str):
    """Build the engine and prompts for HuggingFaceM4/Idefics3-8B-Llama3.

    Only the "image" modality is accepted. Returns
    ``(llm, prompts, stop_token_ids)``; no stop tokens are set.
    """
    assert modality == "image"

    # if you are running out of memory, you can reduce the "longest_edge".
    # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
    processor_kwargs = {"size": {"longest_edge": 3 * 364}}

    llm = LLM(
        model="HuggingFaceM4/Idefics3-8B-Llama3",
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        mm_processor_kwargs=processor_kwargs,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    prompts = []
    for question in questions:
        prompts.append(
            f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
        )
    return llm, prompts, None
217
218
219


# InternVL
220
def run_internvl(questions: list[str], modality: str):
    """Build the engine, prompts and stop tokens for OpenGVLab/InternVL2-2B.

    Only the "image" modality is accepted. Prompts are rendered through the
    model tokenizer's chat template. Returns
    ``(llm, prompts, stop_token_ids)``.
    """
    assert modality == "image"

    model_name = "OpenGVLab/InternVL2-2B"
    engine_kwargs = dict(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    llm = LLM(**engine_kwargs)

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)

    # One single-turn conversation per question.
    conversations = []
    for question in questions:
        user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
        conversations.append([user_turn])
    prompts = tokenizer.apply_chat_template(conversations,
                                            tokenize=False,
                                            add_generation_prompt=True)

    # Stop tokens for InternVL
    # models variants may have different stop tokens
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_token_ids = []
    for token in ("<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"):
        stop_token_ids.append(tokenizer.convert_tokens_to_ids(token))
    return llm, prompts, stop_token_ids
249
250


251
# LLaVA-1.5
252
def run_llava(questions: list[str], modality: str):
    """Build the engine and prompts for llava-hf/llava-1.5-7b-hf.

    Only the "image" modality is accepted. Returns
    ``(llm, prompts, stop_token_ids)``; no stop tokens are set.
    """
    assert modality == "image"

    prompts = []
    for question in questions:
        prompts.append(f"USER: <image>\n{question}\nASSISTANT:")

    llm = LLM(
        model="llava-hf/llava-1.5-7b-hf",
        max_model_len=4096,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    return llm, prompts, None
264
265
266


# LLaVA-1.6/LLaVA-NeXT
267
def run_llava_next(questions: list[str], modality: str):
    """Build the engine and prompts for llava-hf/llava-v1.6-mistral-7b-hf.

    Only the "image" modality is accepted. Returns
    ``(llm, prompts, stop_token_ids)``; no stop tokens are set.
    """
    assert modality == "image"

    prompts = []
    for question in questions:
        prompts.append(f"[INST] <image>\n{question} [/INST]")

    llm = LLM(
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        max_model_len=8192,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    return llm, prompts, None
276
277
278
279


# LlaVA-NeXT-Video
# Currently only support for video input
280
def run_llava_next_video(questions: list[str], modality: str):
    """Build the engine and prompts for llava-hf/LLaVA-NeXT-Video-7B-hf.

    Only the "video" modality is accepted. Returns
    ``(llm, prompts, stop_token_ids)``; no stop tokens are set.
    """
    assert modality == "video"

    prompts = []
    for question in questions:
        prompts.append(f"USER: <video>\n{question} ASSISTANT:")

    llm = LLM(
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        max_model_len=8192,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    return llm, prompts, None
291
292


293
# LLaVA-OneVision
294
def run_llava_onevision(questions: list[str], modality: str):
    """Build the engine and prompts for llava-onevision-qwen2-7b-ov-hf.

    Supports the "image" and "video" modalities; the placeholder token in
    the prompt is ``<image>`` or ``<video>`` accordingly.

    Returns:
        ``(llm, prompts, stop_token_ids)``; ``stop_token_ids`` is ``None``.

    Raises:
        ValueError: If ``modality`` is neither "image" nor "video".
    """
    # Validate before loading the model; previously an unsupported modality
    # only surfaced later as an unbound `prompts` variable.
    if modality not in ("image", "video"):
        raise ValueError(f"Modality {modality} is not supported.")

    # Adjacent literals instead of a backslash continuation inside the
    # f-string, which embedded the next source line's indentation between
    # the user turn and the assistant header.
    prompts = [
        (f"<|im_start|>user <{modality}>\n{question}<|im_end|>"
         "<|im_start|>assistant\n") for question in questions
    ]

    llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
              max_model_len=16384,
              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
    stop_token_ids = None
    return llm, prompts, stop_token_ids
313
314


315
# Mantis
316
def run_mantis(questions: list[str], modality: str):
    """Build the engine, prompts and stop tokens for Mantis-8B-siglip-llama3.

    Only the "image" modality is accepted. The question and the ``<image>``
    placeholder are wrapped in a Llama-3 style chat template. Returns
    ``(llm, prompts, stop_token_ids)``.
    """
    assert modality == "image"

    llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'  # noqa: E501
    prompts = []
    for question in questions:
        user_text = f"{question}\n<image>"
        prompts.append(llama3_template.format(user_text))

    engine_kwargs = dict(
        model="TIGER-Lab/Mantis-8B-siglip-llama3",
        max_model_len=4096,
        hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    llm = LLM(**engine_kwargs)
    return llm, prompts, [128009]
333
334
335


# MiniCPM-V
336
def run_minicpmv_base(questions: list[str], modality: str, model_name):
    """Shared builder for the MiniCPM-V / MiniCPM-o family.

    Accepts the "image" and "video" modalities. Prompts are rendered through
    the tokenizer's chat template of ``model_name``. Returns
    ``(llm, prompts, stop_token_ids)``.
    """
    assert modality in ["image", "video"]
    # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa

    # Checkpoints that can be passed as `model_name`, and their modalities:
    #   2.0  : "HwwwH/MiniCPM-V-2"             (image)
    #          The official repo doesn't work yet, so we need to use a fork
    #          for now. For more details, please see:
    #          https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa
    #   2.5  : "openbmb/MiniCPM-Llama3-V-2_5"  (image)
    #   2.6  : "openbmb/MiniCPM-V-2_6"         (image, video)
    #   o2.6 : "openbmb/MiniCPM-o-2_6"         (image, video, audio)
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    engine_kwargs = dict(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    llm = LLM(**engine_kwargs)

    # NOTE The stop_token_ids are different for various versions of MiniCPM-V
    #   2.0        : [tokenizer.eos_id]
    #   2.5        : [tokenizer.eos_id, tokenizer.eot_id]
    #   2.6 / o2.6 : the two tokens converted below
    stop_token_ids = []
    for token in ('<|im_end|>', '<|endoftext|>'):
        stop_token_ids.append(tokenizer.convert_tokens_to_ids(token))

    if modality == "image":
        placeholder = "(<image>./</image>)"
    else:
        placeholder = "(<video>./</video>)"

    prompts = []
    for question in questions:
        conversation = [{
            'role': 'user',
            'content': f"{placeholder}\n{question}"
        }]
        prompts.append(
            tokenizer.apply_chat_template(conversation,
                                          tokenize=False,
                                          add_generation_prompt=True))
    return llm, prompts, stop_token_ids
393
394


395
396
def run_minicpmo(questions: list[str], modality: str):
    """Run the shared MiniCPM builder with the MiniCPM-o-2_6 checkpoint."""
    checkpoint = "openbmb/MiniCPM-o-2_6"
    return run_minicpmv_base(questions, modality, checkpoint)
397
398


399
400
def run_minicpmv(questions: list[str], modality: str):
    """Run the shared MiniCPM builder with the MiniCPM-V-2_6 checkpoint."""
    checkpoint = "openbmb/MiniCPM-V-2_6"
    return run_minicpmv_base(questions, modality, checkpoint)
401
402


403
# LLama 3.2
404
def run_mllama(questions: list[str], modality: str):
    """Build the engine and prompts for Llama-3.2-11B-Vision-Instruct.

    Only the "image" modality is accepted. Prompts are rendered through the
    tokenizer's chat template from structured image + text content. Returns
    ``(llm, prompts, stop_token_ids)``; no stop tokens are set.
    """
    assert modality == "image"

    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"

    # Note: The default setting of max_num_seqs (256) and
    # max_model_len (131072) for this model may cause OOM.
    # You may lower either to run this example on lower-end GPUs.

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_kwargs = dict(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=16,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    llm = LLM(**engine_kwargs)

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # One single-turn conversation per question: an image part, then text.
    conversations = []
    for question in questions:
        content = [{"type": "image"}, {"type": "text", "text": question}]
        conversations.append([{"role": "user", "content": content}])
    prompts = tokenizer.apply_chat_template(conversations,
                                            add_generation_prompt=True,
                                            tokenize=False)
    return llm, prompts, None
437
438


439
# Molmo
440
def run_molmo(questions: list[str], modality: str):
    """Build the engine and prompts for allenai/Molmo-7B-D-0924.

    Only the "image" modality is accepted.

    Returns:
        ``(llm, prompts, stop_token_ids)``; ``stop_token_ids`` is ``None``.
    """
    assert modality == "image"

    model_name = "allenai/Molmo-7B-D-0924"

    llm = LLM(
        model=model_name,
        trust_remote_code=True,
        dtype="bfloat16",
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    # Adjacent literals instead of a backslash continuation inside the
    # f-string, which embedded the next source line's indentation between
    # the user turn and the assistant header.
    prompts = [
        (f"<|im_start|>user <image>\n{question}<|im_end|>"
         "<|im_start|>assistant\n") for question in questions
    ]
    stop_token_ids = None
    return llm, prompts, stop_token_ids
458
459


460
# NVLM-D
461
def run_nvlm_d(questions: list[str], modality: str):
    """Build the engine and prompts for nvidia/NVLM-D-72B.

    Only the "image" modality is accepted. Prompts are rendered through the
    model tokenizer's chat template. Returns
    ``(llm, prompts, stop_token_ids)``; no stop tokens are set.
    """
    assert modality == "image"

    model_name = "nvidia/NVLM-D-72B"

    # Adjust this as necessary to fit in GPU
    engine_kwargs = dict(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        tensor_parallel_size=4,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    llm = LLM(**engine_kwargs)

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)

    # One single-turn conversation per question.
    conversations = []
    for question in questions:
        user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
        conversations.append([user_turn])
    prompts = tokenizer.apply_chat_template(conversations,
                                            tokenize=False,
                                            add_generation_prompt=True)
    return llm, prompts, None
486
487


488
489
# PaliGemma
def run_paligemma(question: str, modality: str):
    """Build the engine and the fixed prompt for google/paligemma-3b-mix-224.

    Only the "image" modality is accepted. The ``question`` argument is not
    used: a single fixed task prompt is returned in a one-element list.
    Returns ``(llm, prompt, stop_token_ids)``; no stop tokens are set.
    """
    assert modality == "image"

    # PaliGemma has special prompt format for VQA
    task_prompt = "caption en"
    llm = LLM(
        model="google/paligemma-3b-mix-224",
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    return llm, [task_prompt], None
498
499


500
501
# PaliGemma 2
def run_paligemma2(question: str, modality: str):
    """Build the engine and the fixed prompt for paligemma2-3b-ft-docci-448.

    Only the "image" modality is accepted. The ``question`` argument is not
    used: a single fixed task prompt is returned in a one-element list.
    Returns ``(llm, prompt, stop_token_ids)``; no stop tokens are set.
    """
    assert modality == "image"

    # PaliGemma 2 has special prompt format for VQA
    task_prompt = "caption en"
    llm = LLM(
        model="google/paligemma2-3b-ft-docci-448",
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    return llm, [task_prompt], None


512
# Phi-3-Vision
513
def run_phi3v(questions: list[str], modality: str):
    """Build the engine and prompts for microsoft/Phi-3.5-vision-instruct.

    Only the "image" modality is accepted. Returns
    ``(llm, prompts, stop_token_ids)``; no stop tokens are set.
    """
    assert modality == "image"

    prompts = []
    for question in questions:
        prompts.append(
            f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n")

    # num_crops is an override kwarg to the multimodal image processor;
    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
    # to use 16 for single frame scenarios, and 4 for multi-frame.
    #
    # Generally speaking, a larger value for num_crops results in more
    # tokens per image instance, because it may scale the image more in
    # the image preprocessing. Some references in the model docs and the
    # formula for image tokens after the preprocessing
    # transform can be found below.
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    # Note - mm_processor_kwargs can also be passed to generate/chat calls
    processor_kwargs = {"num_crops": 16}

    llm = LLM(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        mm_processor_kwargs=processor_kwargs,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    return llm, prompts, None
544
545


546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
# Phi-4-multimodal-instruct
def run_phi4mm(questions: list[str], modality: str):
    """
    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
    show how to process image inputs.

    Only the "image" modality is accepted. The model snapshot is downloaded
    locally so that the bundled vision LoRA can be located and registered
    with the engine.

    Returns:
        ``(llm, prompts, stop_token_ids)``; ``stop_token_ids`` is ``None``.
    """
    assert modality == "image"
    model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    vision_lora_path = os.path.join(model_path, "vision-lora")
    prompts = [
        f"<|user|><|image_1|>{question}<|end|><|assistant|>"
        for question in questions
    ]
    llm = LLM(
        model=model_path,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        enable_lora=True,
        max_lora_rank=320,
        lora_extra_vocab_size=0,
        # Honour the CLI switch like every other model builder in this file;
        # it was previously ignored for this model.
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    lora_request = LoRARequest("vision", 1, vision_lora_path)
    # To maintain code compatibility in this script, we add LoRA here.
    llm.llm_engine.add_lora(lora_request=lora_request)
    # You can also add LoRA using:
    # llm.generate(prompts, lora_request=lora_request,...)

    stop_token_ids = None
    return llm, prompts, stop_token_ids


580
# Pixtral HF-format
581
def run_pixtral_hf(questions: list[str], modality: str):
    """Build the engine and prompts for mistral-community/pixtral-12b.

    Only the "image" modality is accepted. Returns
    ``(llm, prompts, stop_token_ids)``; no stop tokens are set.
    """
    assert modality == "image"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    engine_kwargs = dict(
        model="mistral-community/pixtral-12b",
        max_model_len=8192,
        max_num_seqs=2,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    llm = LLM(**engine_kwargs)

    prompts = []
    for question in questions:
        prompts.append(f"<s>[INST]{question}\n[IMG][/INST]")
    return llm, prompts, None
597
598


599
# Qwen
600
def run_qwen_vl(questions: list[str], modality: str):
    """Build the engine and prompts for Qwen/Qwen-VL.

    Only the "image" modality is accepted. The architecture is overridden
    to ``QwenVLForConditionalGeneration`` via ``hf_overrides``. Returns
    ``(llm, prompts, stop_token_ids)``; no stop tokens are set.
    """
    assert modality == "image"

    engine_kwargs = dict(
        model="Qwen/Qwen-VL",
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    llm = LLM(**engine_kwargs)

    prompts = []
    for question in questions:
        prompts.append(f"{question}Picture 1: <img></img>\n")
    return llm, prompts, None
615
616


617
# Qwen2-VL
618
def run_qwen2_vl(questions: list[str], modality: str):
    """Build the engine and prompts for Qwen/Qwen2-VL-7B-Instruct.

    Supports the "image" and "video" modalities; the vision placeholder
    token inside the prompt is chosen accordingly.

    Returns:
        ``(llm, prompts, stop_token_ids)``; ``stop_token_ids`` is ``None``.

    Raises:
        ValueError: If ``modality`` is neither "image" nor "video".
    """
    # Resolve the placeholder before loading the model so an unsupported
    # modality fails fast instead of leaving `placeholder` unbound later.
    placeholders = {"image": "<|image_pad|>", "video": "<|video_pad|>"}
    if modality not in placeholders:
        raise ValueError(f"Modality {modality} is not supported.")
    placeholder = placeholders[modality]

    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    llm = LLM(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
        },
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    prompts = [
        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
         f"{question}<|im_end|>\n"
         "<|im_start|>assistant\n") for question in questions
    ]
    stop_token_ids = None
    return llm, prompts, stop_token_ids
647
648


Roger Wang's avatar
Roger Wang committed
649
# Qwen2.5-VL
650
def run_qwen2_5_vl(questions: list[str], modality: str):
    """Build the engine and prompts for Qwen/Qwen2.5-VL-3B-Instruct.

    Supports the "image" and "video" modalities; the vision placeholder
    token inside the prompt is chosen accordingly.

    Returns:
        ``(llm, prompts, stop_token_ids)``; ``stop_token_ids`` is ``None``.

    Raises:
        ValueError: If ``modality`` is neither "image" nor "video".
    """
    # Resolve the placeholder before loading the model so an unsupported
    # modality fails fast instead of leaving `placeholder` unbound later.
    placeholders = {"image": "<|image_pad|>", "video": "<|video_pad|>"}
    if modality not in placeholders:
        raise ValueError(f"Modality {modality} is not supported.")
    placeholder = placeholders[modality]

    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

    llm = LLM(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    prompts = [
        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
         f"{question}<|im_end|>\n"
         "<|im_start|>assistant\n") for question in questions
    ]
    stop_token_ids = None
    return llm, prompts, stop_token_ids
Roger Wang's avatar
Roger Wang committed
679
680


681
# Maps each supported `--model-type` value to the function that builds its
# engine, prompts and stop token ids. The keys double as the CLI choices.
model_example_map = {
    "aria": run_aria,
    "blip-2": run_blip2,
    "chameleon": run_chameleon,
    "deepseek_vl_v2": run_deepseek_vl2,
    "florence2": run_florence2,
    "fuyu": run_fuyu,
    "gemma3": run_gemma3,
    "glm4v": run_glm4v,
    "h2ovl_chat": run_h2ovl,
    "idefics3": run_idefics3,
    "internvl_chat": run_internvl,
    "llava": run_llava,
    "llava-next": run_llava_next,
    "llava-next-video": run_llava_next_video,
    "llava-onevision": run_llava_onevision,
    "mantis": run_mantis,
    "minicpmo": run_minicpmo,
    "minicpmv": run_minicpmv,
    "mllama": run_mllama,
    "molmo": run_molmo,
    "NVLM_D": run_nvlm_d,
    "paligemma": run_paligemma,
    "paligemma2": run_paligemma2,
    "phi3_v": run_phi3v,
    "phi4_mm": run_phi4mm,
    "pixtral_hf": run_pixtral_hf,
    "qwen_vl": run_qwen_vl,
    "qwen2_vl": run_qwen2_vl,
    "qwen2_5_vl": run_qwen2_5_vl,
}


714
715
716
717
718
719
720
721
722
723
724
def get_multi_modal_input(args):
    """
    Load the bundled demo asset and questions for ``args.modality``.

    return {
        "data": image or video,
        "questions": list of questions,
    }

    Raises ValueError when the modality is neither "image" nor "video".
    """
    modality = args.modality

    if modality == "image":
        # Input image and question
        asset = ImageAsset("cherry_blossom")
        return {
            "data": asset.pil_image.convert("RGB"),
            "questions": [
                "What is the content of this image?",
                "Describe the content of this image in detail.",
                "What's in the image?",
                "Where is this image taken?",
            ],
        }

    if modality == "video":
        # Input video and question
        asset = VideoAsset(name="sample_demo_1.mp4",
                           num_frames=args.num_frames)
        return {
            "data": asset.np_ndarrays,
            "questions": ["Why is this video funny?"],
        }

    raise ValueError(f"Modality {modality} is not supported.")


752
753
def apply_image_repeat(image_repeat_prob, num_prompts, data,
                       prompts: list[str], modality):
    """Repeats images with provided probability of "image_repeat_prob".
    Used to simulate hit/miss for the MM preprocessor cache.

    For each of the ``num_prompts`` requests, the previous image is either
    reused (a "repeat") or copied and given a new top-left pixel derived
    from the request index, so that it differs from the earlier images.

    Args:
        image_repeat_prob: Probability in ``[0, 1]`` of reusing the
            previous image for a request.
        num_prompts: Number of request dicts to build.
        data: The base image. It must provide ``copy()`` and
            ``putpixel((x, y), value)``; it is never modified in place.
        prompts: Prompt strings, cycled through by request index.
        modality: Key under which the image is stored in
            ``multi_modal_data``.

    Returns:
        A list of ``{"prompt": ..., "multi_modal_data": {modality: image}}``
        dicts, one per request.
    """
    assert (image_repeat_prob <= 1.0 and image_repeat_prob >= 0)
    no_yes = [0, 1]
    probs = [1.0 - image_repeat_prob, image_repeat_prob]

    inputs = []
    cur_image = data
    for i in range(num_prompts):
        # The assert above already rejects None, so no further check needed.
        res = random.choices(no_yes, probs)[0]
        if res == 0:
            # No repeat => Modify one pixel
            cur_image = cur_image.copy()
            # Encode the index as three base-256 digits so every channel
            # stays within 0-255 (the middle one used to grow past 255
            # once i reached 65536).
            new_val = ((i // 65536) % 256, (i // 256) % 256, i % 256)
            cur_image.putpixel((0, 0), new_val)

        inputs.append({
            "prompt": prompts[i % len(prompts)],
            "multi_modal_data": {
                modality: cur_image
            }
        })

    return inputs


782
783
784
785
786
def main(args):
    """Run the offline vision-language demo described by ``args``.

    Loads the demo image/video, builds the engine and prompts for the
    selected model type, assembles one or many requests, runs generation
    and prints each generated text.

    Args:
        args: Parsed CLI namespace. The attributes read here are
            ``model_type``, ``modality``, ``num_prompts``,
            ``image_repeat_prob``, ``time_generate`` and
            ``use_different_prompt_per_request``.

    Raises:
        ValueError: If ``num_prompts`` is not positive or the model type is
            not registered in ``model_example_map``.
    """
    import time

    # Validate cheap inputs before any model is loaded.
    if args.num_prompts <= 0:
        raise ValueError(
            f"num_prompts must be positive, got {args.num_prompts}.")

    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    modality = args.modality
    mm_input = get_multi_modal_input(args)
    data = mm_input["data"]
    questions = mm_input["questions"]

    llm, prompts, stop_token_ids = model_example_map[model](questions,
                                                            modality)
    # Don't want to check the flag multiple times, so just hijack `prompts`.
    if not args.use_different_prompt_per_request:
        prompts = [prompts[0]]

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=stop_token_ids)

    if args.num_prompts == 1:
        # Single inference
        inputs = {
            "prompt": prompts[0],
            "multi_modal_data": {
                modality: data
            },
        }
    elif args.image_repeat_prob is not None:
        # Batch inference:
        # repeat images with specified probability of "image_repeat_prob"
        inputs = apply_image_repeat(args.image_repeat_prob, args.num_prompts,
                                    data, prompts, modality)
    else:
        # Batch inference: use the same image for all prompts
        inputs = [{
            "prompt": prompts[i % len(prompts)],
            "multi_modal_data": {
                modality: data
            },
        } for i in range(args.num_prompts)]

    # perf_counter is monotonic, so the measurement is not affected by
    # system clock adjustments the way time.time() can be.
    start_time = time.perf_counter()
    outputs = llm.generate(inputs, sampling_params=sampling_params)
    elapsed_time = time.perf_counter() - start_time
    if args.time_generate:
        print("-- generate time = {}".format(elapsed_time))

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them into the
    # module-level `args` (which the model builders above also read), and
    # run the demo.
    parser = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "vision language models for text generation")

    # Model and workload selection.
    parser.add_argument("--model-type",
                        "-m",
                        type=str,
                        default="llava",
                        choices=model_example_map.keys(),
                        help='Huggingface "model_type".')
    parser.add_argument("--num-prompts",
                        type=int,
                        default=4,
                        help="Number of prompts to run.")
    parser.add_argument("--modality",
                        type=str,
                        default="image",
                        choices=["image", "video"],
                        help="Modality of the input.")
    parser.add_argument("--num-frames",
                        type=int,
                        default=16,
                        help="Number of frames to extract from the video.")

    # Multi-modal preprocessor cache controls.
    parser.add_argument(
        "--image-repeat-prob",
        type=float,
        default=None,
        help="Simulates the hit-ratio for multi-modal preprocessor cache"
        " (if enabled)")
    parser.add_argument(
        "--disable-mm-preprocessor-cache",
        action="store_true",
        help="If True, disables caching of multi-modal preprocessor/mapper.")

    # Reporting and prompt variation.
    parser.add_argument(
        "--time-generate",
        action="store_true",
        help="If True, then print the total generate() call time")
    parser.add_argument(
        "--use-different-prompt-per-request",
        action="store_true",
        help="If True, then use different prompt (with the same multi-modal "
        "data) for each request.")

    args = parser.parse_args()
    main(args)