vision_language_multi_image.py 21.7 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
3
"""
This example shows how to use vLLM for running offline inference with
Cyrus Leung's avatar
Cyrus Leung committed
4
5
multi-image input on vision language models for text generation,
using the chat template defined by the model.
6
"""
7
import os
8
from argparse import Namespace
9
from dataclasses import asdict
10
from typing import NamedTuple, Optional
11

12
from huggingface_hub import snapshot_download
13
from PIL.Image import Image
14
from transformers import AutoProcessor, AutoTokenizer
15

16
from vllm import LLM, EngineArgs, SamplingParams
17
from vllm.lora.request import LoRARequest
18
19
20
21
22
23
24
25
26
27
from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser

QUESTION = "What is the content of each image?"
IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
]


28
class ModelRequestData(NamedTuple):
    """Everything a model-specific loader hands back to the runner functions.

    The first three fields are required; the remaining ones default to
    ``None`` and are only filled in by loaders that need them.
    """

    # Arguments used to construct the `vllm.LLM` instance.
    engine_args: EngineArgs
    # Fully formatted text prompt, including the image placeholders.
    prompt: str
    # Images passed as multi-modal data alongside the prompt.
    image_data: list[Image]
    # Token ids forwarded to `SamplingParams(stop_token_ids=...)`.
    stop_token_ids: Optional[list[int]] = None
    # Chat template forwarded to `LLM.chat(chat_template=...)`.
    chat_template: Optional[str] = None
    # LoRA adapters registered on the engine before running.
    lora_requests: Optional[list[LoRARequest]] = None
35
36


37
38
39
40
41
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.


42
def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for ``rhymes-ai/Aria``.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the images to fetch and attach to the prompt.

    Returns:
        The request data, including the stop token ids used for this model.
    """
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model="rhymes-ai/Aria",
        tokenizer_mode="slow",
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": num_images},
    )

    # One placeholder line per image, followed by the question.
    image_slots = "<fim_prefix><|img|><fim_suffix>\n" * num_images
    user_turn = f"<|im_start|>user\n{image_slots}{question}<|im_end|>\n"
    prompt = user_turn + "<|im_start|>assistant\n"

    stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
    images = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=images,
    )
62

63

64
65
def load_deepseek_vl2(question: str,
                      image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for DeepSeek-VL2 (tiny).

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the images to fetch and attach to the prompt.

    Returns:
        The request data for ``deepseek-ai/deepseek-vl2-tiny``.
    """
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model="deepseek-ai/deepseek-vl2-tiny",
        max_model_len=4096,
        max_num_seqs=2,
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
        limit_mm_per_prompt={"image": num_images},
    )

    # Images are labelled image_1, image_2, ... each on its own line.
    image_lines = [
        f"image_{index}:<image>\n" for index in range(1, num_images + 1)
    ]
    image_block = "".join(image_lines)
    prompt = f"<|User|>: {image_block}{question}\n\n<|Assistant|>:"

    images = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


87
def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for Gemma 3 (4B, IT).

    The prompt is rendered with the chat template of the model's Hugging Face
    processor.

    Args:
        question: Text question placed after the image entries.
        image_urls: URLs of the images to fetch and attach to the prompt.

    Returns:
        The request data for ``google/gemma-3-4b-it``.
    """
    model_name = "google/gemma-3-4b-it"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        # Default is False; setting it to True is not supported in V1 yet
        mm_processor_kwargs={"do_pan_and_scan": True},
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # User message content: every image first, then the text question.
    content = [{"type": "image", "image": url} for url in image_urls]
    content.append({"type": "text", "text": question})
    messages = [{"role": "user", "content": content}]

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    images = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


125
def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for H2OVL-Mississippi.

    The prompt is rendered with the tokenizer's chat template, and the
    tokenizer's EOS token id is used as the only stop token.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the images to fetch and attach to the prompt.

    Returns:
        The request data for ``h2oai/h2ovl-mississippi-800m``.
    """
    model_name = "h2oai/h2ovl-mississippi-800m"
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={"image": num_images},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    # Numbered placeholders ("Image-1: <image>", ...) precede the question.
    numbered = [
        f"Image-{index}: <image>\n" for index in range(1, num_images + 1)
    ]
    image_block = "\n".join(numbered)
    messages = [{"role": "user", "content": f"{image_block}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    stop_token_ids = [tokenizer.eos_token_id]

    images = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=images,
    )


158
def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for Idefics3-8B-Llama3.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the images to fetch and attach to the prompt.

    Returns:
        The request data for ``HuggingFaceM4/Idefics3-8B-Llama3``.
    """
    num_images = len(image_urls)

    # If you are running out of memory, you can reduce the "longest_edge".
    # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
    processor_kwargs = {
        "size": {
            "longest_edge": 2 * 364
        },
    }

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model="HuggingFaceM4/Idefics3-8B-Llama3",
        max_model_len=8192,
        max_num_seqs=16,
        enforce_eager=True,
        limit_mm_per_prompt={"image": num_images},
        mm_processor_kwargs=processor_kwargs,
    )

    # Numbered placeholders ("Image-1: <image>", ...) precede the question.
    numbered = [
        f"Image-{index}: <image>\n" for index in range(1, num_images + 1)
    ]
    image_block = "\n".join(numbered)
    prompt = f"<|begin_of_text|>User:{image_block}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501

    images = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


187
def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for InternVL2-2B.

    The prompt is rendered with the tokenizer's chat template, and a fixed
    list of stop words is converted to token ids with the same tokenizer.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the images to fetch and attach to the prompt.

    Returns:
        The request data for ``OpenGVLab/InternVL2-2B``.
    """
    model_name = "OpenGVLab/InternVL2-2B"
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": num_images},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    # Numbered placeholders ("Image-1: <image>", ...) precede the question.
    numbered = [
        f"Image-{index}: <image>\n" for index in range(1, num_images + 1)
    ]
    image_block = "\n".join(numbered)
    messages = [{"role": "user", "content": f"{image_block}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Stop tokens for InternVL.
    # Model variants may have different stop tokens; please refer to the
    # model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_words = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    stop_token_ids = [
        tokenizer.convert_tokens_to_ids(word) for word in stop_words
    ]

    images = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=images,
    )
221
222


223
def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for Llama 3.2 Vision.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the images to fetch and attach to the prompt.

    Returns:
        The request data for ``meta-llama/Llama-3.2-11B-Vision-Instruct``.
    """
    num_images = len(image_urls)

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
        max_model_len=4096,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": num_images},
    )

    # All image tokens come first, then the begin-of-text token and question.
    image_tokens = "<|image|>" * num_images
    prompt = f"{image_tokens}<|begin_of_text|>{question}"

    images = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


243
def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for NVLM-D-72B.

    The prompt is rendered with the tokenizer's chat template.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the images to fetch and attach to the prompt.

    Returns:
        The request data for ``nvidia/NVLM-D-72B``.
    """
    model_name = "nvidia/NVLM-D-72B"
    num_images = len(image_urls)

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        tensor_parallel_size=4,
        limit_mm_per_prompt={"image": num_images},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    # Numbered placeholders ("Image-1: <image>", ...) precede the question.
    numbered = [
        f"Image-{index}: <image>\n" for index in range(1, num_images + 1)
    ]
    image_block = "\n".join(numbered)
    messages = [{"role": "user", "content": f"{image_block}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    images = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


273
def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for Pixtral (HF format).

    Args:
        question: Text question placed before the image placeholders.
        image_urls: URLs of the images to fetch and attach to the prompt.

    Returns:
        The request data for ``mistral-community/pixtral-12b``.
    """
    num_images = len(image_urls)

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model="mistral-community/pixtral-12b",
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": num_images},
    )

    # Here the question comes first and the image tokens follow it.
    image_tokens = "[IMG]" * num_images
    prompt = f"<s>[INST]{question}\n{image_tokens}[/INST]"

    images = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


295
def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for Phi-3.5-vision.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the images to fetch and attach to the prompt.

    Returns:
        The request data for ``microsoft/Phi-3.5-vision-instruct``.
    """
    num_images = len(image_urls)

    # num_crops is an override kwarg to the multimodal image processor;
    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
    # to use 16 for single frame scenarios, and 4 for multi-frame.
    #
    # Generally speaking, a larger value for num_crops results in more
    # tokens per image instance, because it may scale the image more in
    # the image preprocessing. Some references in the model docs and the
    # formula for image tokens after the preprocessing
    # transform can be found below.
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    engine_args = EngineArgs(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": num_images},
        mm_processor_kwargs={"num_crops": 4},
    )

    # One numbered image tag per line: <|image_1|>, <|image_2|>, ...
    image_tags = [f"<|image_{index}|>" for index in range(1, num_images + 1)]
    image_block = "\n".join(image_tags)
    prompt = f"<|user|>\n{image_block}\n{question}<|end|>\n<|assistant|>\n"

    images = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


327
328
329
330
331
332
333
334
335
336
def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for Phi-4-multimodal-instruct with image inputs.

    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
    show how to process multi-image inputs.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the images to fetch and attach to the prompt.

    Returns:
        The request data, including a LoRA request for the vision adapter.
    """
    num_images = len(image_urls)

    model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    vision_lora_path = os.path.join(model_path, "vision-lora")

    engine_args = EngineArgs(
        model=model_path,
        trust_remote_code=True,
        max_model_len=10000,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": num_images},
        enable_lora=True,
        max_lora_rank=320,
    )

    # Numbered image tags are concatenated without separators.
    image_tags = [f"<|image_{index}|>" for index in range(1, num_images + 1)]
    image_block = "".join(image_tags)
    prompt = f"<|user|>{image_block}{question}<|end|><|assistant|>"

    images = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
        lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
    )


359
def load_qwen_vl_chat(question: str,
                      image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for Qwen-VL-Chat.

    A ChatML chat template is passed explicitly, both when rendering the
    prompt here and in the returned request data.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the images to fetch and attach to the prompt.

    Returns:
        The request data for ``Qwen/Qwen-VL-Chat``, including stop token ids
        and the chat template.
    """
    model_name = "Qwen/Qwen-VL-Chat"
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        limit_mm_per_prompt={"image": num_images},
    )

    # Numbered picture slots, one per line: "Picture 1: <img></img>", ...
    picture_lines = [
        f"Picture {index}: <img></img>\n"
        for index in range(1, num_images + 1)
    ]
    picture_block = "".join(picture_lines)

    # This model does not have a chat_template attribute on its tokenizer,
    # so we need to explicitly pass it. We use ChatML since it's used in the
    # generation utils of the model:
    # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)

    # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating
    chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"  # noqa: E501

    messages = [{"role": "user", "content": f"{picture_block}\n{question}"}]
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        chat_template=chat_template,
    )

    stop_words = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
    stop_token_ids = [
        tokenizer.convert_tokens_to_ids(word) for word in stop_words
    ]

    images = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=images,
        chat_template=chat_template,
    )


401
def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for Qwen2-VL-7B-Instruct.

    If the optional ``qwen_vl_utils`` package is importable, its
    ``process_vision_info`` helper produces the image inputs and a smaller
    context length is used; otherwise the images are fetched directly and a
    warning is printed.

    Args:
        question: Text question placed after the image entries.
        image_urls: URLs of the images to attach to the prompt.

    Returns:
        The request data for ``Qwen/Qwen2-VL-7B-Instruct``.
    """
    try:
        from qwen_vl_utils import process_vision_info
    except ModuleNotFoundError:
        print('WARNING: `qwen-vl-utils` not installed, input images will not '
              'be automatically resized. You can enable this functionality by '
              '`pip install qwen-vl-utils`.')
        process_vision_info = None

    can_resize = process_vision_info is not None
    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    # Tested on L40
    engine_args = EngineArgs(
        model=model_name,
        # A larger context is requested when images are not resized.
        max_model_len=4096 if can_resize else 32768,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # User message content: every image first, then the text question.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant."
        },
        {
            "role": "user",
            "content": user_content
        },
    ]

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    if can_resize:
        image_data, _ = process_vision_info(messages)
    else:
        image_data = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )
452
453


454
def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the engine arguments, prompt and images for Qwen2.5-VL-3B-Instruct.

    If the optional ``qwen_vl_utils`` package is importable, its
    ``process_vision_info`` helper produces the image inputs and a smaller
    context length is used; otherwise the images are fetched directly and a
    warning is printed.

    Args:
        question: Text question placed after the image entries.
        image_urls: URLs of the images to attach to the prompt.

    Returns:
        The request data for ``Qwen/Qwen2.5-VL-3B-Instruct``.
    """
    try:
        from qwen_vl_utils import process_vision_info
    except ModuleNotFoundError:
        print('WARNING: `qwen-vl-utils` not installed, input images will not '
              'be automatically resized. You can enable this functionality by '
              '`pip install qwen-vl-utils`.')
        process_vision_info = None

    can_resize = process_vision_info is not None
    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        # A larger context is requested when images are not resized.
        max_model_len=4096 if can_resize else 32768,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # User message content: every image first, then the text question.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant."
        },
        {
            "role": "user",
            "content": user_content
        },
    ]

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    if can_resize:
        image_data, _ = process_vision_info(messages,
                                            return_video_kwargs=False)
    else:
        image_data = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )


507
# Maps each supported `--model-type` value to the loader that prepares the
# engine arguments, prompt and images for that model.
model_example_map = {
    "aria": load_aria,
    "deepseek_vl_v2": load_deepseek_vl2,
    "gemma3": load_gemma3,
    "h2ovl_chat": load_h2ovl,
    "idefics3": load_idefics3,
    "internvl_chat": load_internvl,
    "mllama": load_mllama,
    "NVLM_D": load_nvlm_d,
    "phi3_v": load_phi3v,
    "phi4_mm": load_phi4mm,
    "pixtral_hf": load_pixtral_hf,
    "qwen_vl_chat": load_qwen_vl_chat,
    "qwen2_vl": load_qwen2_vl,
    "qwen2_5_vl": load_qwen2_5_vl,
}


525
526
def run_generate(model: str, question: str, image_urls: list[str],
                 seed: Optional[int]):
    """Run offline inference with `LLM.generate` and print the generated text.

    The model-specific loader supplies an already formatted prompt together
    with the images, which are passed as multi-modal data.

    Args:
        model: Key into `model_example_map` selecting the loader to use.
        question: Text question forwarded to the loader.
        image_urls: URLs of the images forwarded to the loader.
        seed: Seed used when initializing `vllm.LLM`; may be ``None``.

    Raises:
        KeyError: If `model` is not a key of `model_example_map`.
    """
    req_data = model_example_map[model](question, image_urls)

    # Use the `seed` argument rather than the script-level `args` global, so
    # the function also works when called from outside the `__main__` block.
    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**engine_args)

    # To maintain code compatibility in this script, we add LoRA here.
    # You can also add LoRA using:
    # llm.generate(prompts, lora_request=lora_request,...)
    if req_data.lora_requests:
        for lora_request in req_data.lora_requests:
            llm.llm_engine.add_lora(lora_request=lora_request)

    sampling_params = SamplingParams(temperature=0.0,
                                     max_tokens=128,
                                     stop_token_ids=req_data.stop_token_ids)

    outputs = llm.generate(
        {
            "prompt": req_data.prompt,
            "multi_modal_data": {
                "image": req_data.image_data
            },
        },
        sampling_params=sampling_params)

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


557
558
def run_chat(model: str, question: str, image_urls: list[str],
             seed: Optional[int]):
    """Run offline inference with `LLM.chat` and print the generated text.

    The question and the image URLs are sent as a single user message; the
    loader's prompt is not used here, only its engine arguments, stop token
    ids, chat template and LoRA requests.

    Args:
        model: Key into `model_example_map` selecting the loader to use.
        question: Text question sent as the first content part.
        image_urls: URLs sent as `image_url` content parts.
        seed: Seed used when initializing `vllm.LLM`; may be ``None``.
    """
    req_data = model_example_map[model](question, image_urls)

    llm_kwargs = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**llm_kwargs)

    # To maintain code compatibility in this script, we add LoRA here.
    # You can also add LoRA using:
    # llm.generate(prompts, lora_request=lora_request,...)
    if req_data.lora_requests:
        for adapter in req_data.lora_requests:
            llm.llm_engine.add_lora(lora_request=adapter)

    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=128,
        stop_token_ids=req_data.stop_token_ids,
    )

    # Message content: the text question first, then one part per image URL.
    image_parts = [{
        "type": "image_url",
        "image_url": {
            "url": image_url
        },
    } for image_url in image_urls]
    user_message = {
        "role": "user",
        "content": [{
            "type": "text",
            "text": question,
        }, *image_parts],
    }

    outputs = llm.chat(
        [user_message],
        sampling_params=sampling_params,
        chat_template=req_data.chat_template,
    )

    for request_output in outputs:
        print(request_output.outputs[0].text)


def main(args: Namespace):
    """Dispatch to the runner selected by ``args.method``.

    Args:
        args: Parsed command-line arguments providing ``model_type``,
            ``method`` and ``seed``.

    Raises:
        ValueError: If ``args.method`` is neither "generate" nor "chat".
    """
    model = args.model_type
    method = args.method
    seed = args.seed

    # Reject unknown methods up front so the happy path stays flat.
    if method not in ("generate", "chat"):
        raise ValueError(f"Invalid method: {method}")

    if method == "generate":
        run_generate(model, QUESTION, IMAGE_URLS, seed)
        return

    run_chat(model, QUESTION, IMAGE_URLS, seed)


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
Cyrus Leung's avatar
Cyrus Leung committed
616
617
        'vision language models that support multi-image input for text '
        'generation')
618
619
620
621
622
623
    parser.add_argument('--model-type',
                        '-m',
                        type=str,
                        default="phi3_v",
                        choices=model_example_map.keys(),
                        help='Huggingface "model_type".')
624
625
626
627
628
    parser.add_argument("--method",
                        type=str,
                        default="generate",
                        choices=["generate", "chat"],
                        help="The method to run in `vllm.LLM`.")
629
630
631
632
    parser.add_argument("--seed",
                        type=int,
                        default=None,
                        help="Set the seed when initializing `vllm.LLM`.")
633
634
635

    args = parser.parse_args()
    main(args)