# llava_example.py
import argparse
import os
import subprocess

import torch
6
from PIL import Image
7
8

from vllm import LLM
9
from vllm.multimodal.image import ImageFeatureData, ImagePixelData
10
11

# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
# You can use `.buildkite/download-images.sh` to download them.
13
14


15
def run_llava_pixel_values(*, disable_image_processor: bool = False):
    """Run the LLaVA-1.5 demo with ``image_input_type="pixel_values"``.

    Builds an ``LLM`` for ``llava-hf/llava-1.5-7b-hf``, loads the stop-sign
    example image, sends a single prompt with the image attached, and prints
    the text of the first completion of every returned output.

    Args:
        disable_image_processor: Forwarded unchanged to ``LLM``. When true,
            the image is read from ``images/stop_sign_pixel_values.pt`` with
            ``torch.load``; otherwise ``images/stop_sign.jpg`` is opened
            with PIL.
    """
    llm = LLM(
        model="llava-hf/llava-1.5-7b-hf",
        image_input_type="pixel_values",
        image_token_id=32000,
        image_input_shape="1,3,336,336",
        image_feature_size=576,
        disable_image_processor=disable_image_processor,
    )

    # The "<image>" placeholder is repeated 576 times, the same number that is
    # passed above as image_feature_size.
    prompt = "<image>" * 576 + (
        "\nUSER: What is the content of this image?\nASSISTANT:")

    if disable_image_processor:
        # NOTE(review): torch.load unpickles its input. If this asset is
        # fetched from a remote bucket, consider weights_only=True -- confirm
        # the minimum supported torch version first.
        image = torch.load("images/stop_sign_pixel_values.pt")
    else:
        image = Image.open("images/stop_sign.jpg")

    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": ImagePixelData(image),
    })

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


def run_llava_image_features():
    """Run the LLaVA-1.5 demo with ``image_input_type="image_features"``.

    Builds an ``LLM`` for ``llava-hf/llava-1.5-7b-hf``, loads the tensor
    stored in ``images/stop_sign_image_features.pt``, sends a single prompt
    with that tensor attached as ``ImageFeatureData``, and prints the text of
    the first completion of every returned output.
    """
    llm = LLM(
        model="llava-hf/llava-1.5-7b-hf",
        image_input_type="image_features",
        image_token_id=32000,
        image_input_shape="1,576,1024",
        image_feature_size=576,
    )

    # The "<image>" placeholder is repeated 576 times, the same number that is
    # passed above as image_feature_size.
    prompt = "<image>" * 576 + (
        "\nUSER: What is the content of this image?\nASSISTANT:")

    # NOTE(review): torch.load unpickles its input. If this asset is fetched
    # from a remote bucket, consider weights_only=True -- confirm the minimum
    # supported torch version first.
    image: torch.Tensor = torch.load("images/stop_sign_image_features.pt")

    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": ImageFeatureData(image),
    })

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


def main(args):
    """Run the demo selected by ``args.type``.

    ``"pixel_values"`` runs the pixel-value demo with its default arguments;
    any other value runs the image-feature demo.
    """
    if args.type != "pixel_values":
        run_llava_image_features()
        return
    run_llava_pixel_values()


if __name__ == "__main__":
    # Command-line entry point: parse the requested image input type, sync
    # the example assets from S3 into ./images, then run the chosen demo.
    parser = argparse.ArgumentParser(description="Demo on Llava")
    parser.add_argument("--type",
                        type=str,
                        choices=["pixel_values", "image_features"],
                        default="pixel_values",
                        help="image input type")
    args = parser.parse_args()
    # Download from s3
    s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
    local_directory = "images"

    # Make sure the local directory exists or create it
    os.makedirs(local_directory, exist_ok=True)

    # Use AWS CLI to sync the directory, assume anonymous access.
    # check_call raises CalledProcessError on a non-zero exit status, so the
    # demo does not start if the sync fails.
    subprocess.check_call([
        "aws",
        "s3",
        "sync",
        s3_bucket_path,
        local_directory,
        "--no-sign-request",
    ])
    main(args)