# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# ruff: noqa
import argparse

from vllm import LLM
from vllm.sampling_params import SamplingParams
9
from vllm.assets.image import ImageAsset
10
from vllm.multimodal.utils import encode_image_url
Patrick von Platen's avatar
Patrick von Platen committed
11

12
# This script is an offline demo for running Mistral-Small-3.1.
#
# If you want to run a server/client setup, please follow this code:
#
# - Server:
#
# ```bash
# # Mistral format
# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
#   --tokenizer-mode mistral --config-format mistral --load-format mistral \
#   --limit-mm-per-prompt.image 4 --max-model-len 16384
#
# # HF format
# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
#   --limit-mm-per-prompt.image 4 --max-model-len 16384
# ```
#
# - Client:
#
# ```bash
# curl --location 'http://<your-node-url>:8000/v1/chat/completions' \
# --header 'Content-Type: application/json' \
# --header 'Authorization: Bearer token' \
# --data '{
#     "model": "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
#     "messages": [
#       {
#         "role": "user",
#         "content": [
#             {"type" : "text", "text": "Describe this image in detail please."},
#             {"type": "image_url", "image_url": {"url": "https://s3.amazonaws.com/cms.ipressroom.com/338/files/201808/5b894ee1a138352221103195_A680%7Ejogging-edit/A680%7Ejogging-edit_hero.jpg"}},
#             {"type" : "text", "text": "and this one as well. Answer in French."},
#             {"type": "image_url", "image_url": {"url": "https://www.wolframcloud.com/obj/resourcesystem/images/a0e/a0ee3983-46c6-4c92-b85d-059044639928/6af8cfb971db031b.png"}}
#         ]
#       }
#     ]
#   }'
# ```
#
# Usage:
#     python mistral-small.py simple
#     python mistral-small.py advanced

# Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
# These scripts have been tested on 2x L40 GPUs.

Patrick von Platen's avatar
Patrick von Platen committed
58

59
def run_simple_demo(args: argparse.Namespace):
    """Run a single-turn chat that asks for a one-sentence image description.

    Builds an ``LLM`` for Mistral-Small-3.1, sends one user message holding a
    text prompt plus the ``cherry_blossom`` image asset, and prints the first
    completion between two dashed separator lines.

    Args:
        args: Parsed command-line namespace. ``args.format`` selects the
            tokenizer/config/load format and
            ``args.disable_mm_processor_cache`` turns the multi-modal
            processor cache off.
    """
    # One CLI flag drives all three format-related engine options.
    model_format = "mistral" if args.format == "mistral" else "hf"
    processor_cache_gb = 0 if args.disable_mm_processor_cache else 4

    generation_config = SamplingParams(max_tokens=8192)

    engine = LLM(
        model="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        tokenizer_mode=model_format,
        config_format=model_format,
        load_format=model_format,
        limit_mm_per_prompt={"image": 1},
        # Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
        max_model_len=4096,
        max_num_seqs=2,
        tensor_parallel_size=2,
        mm_processor_cache_gb=processor_cache_gb,
    )

    # The image is passed inline through the value encode_image_url returns
    # for the asset's PIL image.
    image = ImageAsset("cherry_blossom").pil_image
    image_part = {
        "type": "image_url",
        "image_url": {"url": encode_image_url(image)},
    }
    text_part = {"type": "text", "text": "Describe this image in one sentence."}

    conversation = [{"role": "user", "content": [text_part, image_part]}]
    results = engine.chat(conversation, sampling_params=generation_config)

    separator = "-" * 50
    print(separator)
    print(results[0].outputs[0].text)
    print(separator)
Patrick von Platen's avatar
Patrick von Platen committed
95
96


97
def run_advanced_demo(args: argparse.Namespace):
    """Run a multi-turn, multi-image chat and print the model's reply.

    The conversation holds a user turn with a text prompt and two image URLs,
    a canned assistant reply, a follow-up user instruction, and a final user
    turn carrying a third image URL. The first completion is printed between
    two dashed separator lines.

    Args:
        args: Parsed command-line namespace. ``args.format`` selects the
            tokenizer/config/load format and
            ``args.disable_mm_processor_cache`` turns the multi-modal
            processor cache off.
    """
    image_limit = 3
    tokens_per_image = 4096

    # One CLI flag drives all three format-related engine options.
    model_format = "mistral" if args.format == "mistral" else "hf"
    processor_cache_gb = 0 if args.disable_mm_processor_cache else 4

    generation_config = SamplingParams(max_tokens=8192, temperature=0.7)
    engine = LLM(
        model="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        tokenizer_mode=model_format,
        config_format=model_format,
        load_format=model_format,
        limit_mm_per_prompt={"image": image_limit},
        # Context length is sized as a per-image token allowance times the
        # per-prompt image limit.
        max_model_len=image_limit * tokens_per_image,
        tensor_parallel_size=2,
        mm_processor_cache_gb=processor_cache_gb,
    )

    first_turn_urls = (
        "https://huggingface.co/datasets/patrickvonplaten/random_img/resolve/main/yosemite.png",
        "https://picsum.photos/seed/picsum/200/300",
    )
    last_turn_url = "https://picsum.photos/id/32/512/512"

    first_turn_images = [
        {"type": "image_url", "image_url": {"url": image_url}}
        for image_url in first_turn_urls
    ]

    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe the following image."},
                *first_turn_images,
            ],
        },
        {"role": "assistant", "content": "The images show nature."},
        {
            "role": "user",
            "content": "More details please and answer only in French!.",
        },
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": last_turn_url}},
            ],
        },
    ]

    results = engine.chat(messages=conversation, sampling_params=generation_config)

    separator = "-" * 50
    print(separator)
    print(results[0].outputs[0].text)
    print(separator)
Patrick von Platen's avatar
Patrick von Platen committed
149
150


151
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the demo's command-line options.

    Args:
        argv: Optional sequence of argument strings to parse. When ``None``
            (the default) the arguments are taken from ``sys.argv[1:]``, as
            before; passing an explicit list lets callers and tests drive the
            parser without touching process-wide state.

    Returns:
        A namespace with three attributes:
        ``mode`` ("simple" or "advanced", positional and required),
        ``format`` ("mistral" or "hf", default "mistral"), and
        ``disable_mm_processor_cache`` (bool, default ``False``).
    """
    parser = argparse.ArgumentParser(
        description="Run a demo in simple or advanced mode."
    )

    parser.add_argument(
        "mode",
        choices=["simple", "advanced"],
        help="Specify the demo mode: 'simple' or 'advanced'",
    )

    parser.add_argument(
        "--format",
        choices=["mistral", "hf"],
        default="mistral",
        help="Specify the format of the model to load.",
    )

    parser.add_argument(
        "--disable-mm-processor-cache",
        action="store_true",
        help="If True, disables caching of multi-modal processor.",
    )

    # argparse falls back to sys.argv[1:] when argv is None.
    return parser.parse_args(argv)

176

177
178
def main():
    """Parse the command line and run the demo selected by ``mode``."""
    args = parse_args()

    # Map each mode to the banner printed before the demo and the demo itself.
    demos = {
        "simple": ("Running simple demo...", run_simple_demo),
        "advanced": ("Running advanced demo...", run_advanced_demo),
    }

    selected = demos.get(args.mode)
    if selected is None:
        # Unknown modes are ignored.
        return

    banner, demo = selected
    print(banner)
    demo(args)
Patrick von Platen's avatar
Patrick von Platen committed
186
187
188
189


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()