{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0",
   "metadata": {},
   "source": [
    "# Query Vision Language Model"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1",
   "metadata": {},
   "source": [
    "## Querying Qwen-VL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import nest_asyncio\n",
    "\n",
    "nest_asyncio.apply()  # Run this first.\n",
    "\n",
    "model_path = \"Qwen/Qwen2.5-VL-3B-Instruct\"\n",
    "chat_template = \"qwen2-vl\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lets create a prompt.\n",
    "\n",
    "from io import BytesIO\n",
    "import requests\n",
    "from PIL import Image\n",
    "\n",
    "from sglang.srt.parser.conversation import chat_templates\n",
    "\n",
    "image = Image.open(\n",
    "    BytesIO(\n",
    "        requests.get(\n",
    "            \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n",
    "        ).content\n",
    "    )\n",
    ")\n",
    "\n",
    "conv = chat_templates[chat_template].copy()\n",
    "conv.append_message(conv.roles[0], f\"What's shown here: {conv.image_token}?\")\n",
    "conv.append_message(conv.roles[1], \"\")\n",
    "conv.image_data = [image]\n",
    "\n",
    "print(conv.get_prompt())\n",
    "image"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4",
   "metadata": {},
   "source": [
    "### Query via the offline Engine API"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sglang import Engine\n",
    "\n",
    "llm = Engine(\n",
    "    model_path=model_path, chat_template=chat_template, mem_fraction_static=0.8\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6",
   "metadata": {},
   "outputs": [],
   "source": [
    "out = llm.generate(prompt=conv.get_prompt(), image_data=[image])\n",
    "print(out[\"text\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7",
   "metadata": {},
   "source": [
    "### Query via the offline Engine API, but send precomputed embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compute the image embeddings using Huggingface.\n",
    "\n",
    "from transformers import AutoProcessor\n",
    "from transformers import Qwen2_5_VLForConditionalGeneration\n",
    "\n",
    "processor = AutoProcessor.from_pretrained(model_path, use_fast=True)\n",
    "vision = (\n",
    "    Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path).eval().visual.cuda()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9",
   "metadata": {},
   "outputs": [],
   "source": [
    "processed_prompt = processor(\n",
    "    images=[image], text=conv.get_prompt(), return_tensors=\"pt\"\n",
    ")\n",
    "input_ids = processed_prompt[\"input_ids\"][0].detach().cpu().tolist()\n",
    "precomputed_embeddings = vision(\n",
    "    processed_prompt[\"pixel_values\"].cuda(), processed_prompt[\"image_grid_thw\"].cuda()\n",
    ")\n",
    "\n",
    "mm_item = dict(\n",
    "    modality=\"IMAGE\",\n",
    "    image_grid_thw=processed_prompt[\"image_grid_thw\"],\n",
    "    precomputed_embeddings=precomputed_embeddings,\n",
    ")\n",
    "out = llm.generate(input_ids=input_ids, image_data=[mm_item])\n",
    "print(out[\"text\"])"
   ]
  },
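  {
   "cell_type": "markdown",
   "id": "9a",
   "metadata": {},
   "source": [
    "Before launching Llama 4, shut down the Qwen engine so its GPU memory is freed (a minimal cleanup sketch; `Engine.shutdown()` is SGLang's standard teardown call)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Free the GPU memory held by the Qwen engine before starting the larger Llama 4 engine.\n",
    "llm.shutdown()"
   ]
  },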
  {
   "cell_type": "markdown",
   "id": "10",
   "metadata": {},
   "source": [
    "## Querying Llama 4 (Vision)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11",
   "metadata": {},
   "outputs": [],
   "source": [
    "import nest_asyncio\n",
    "\n",
    "nest_asyncio.apply()  # Run this first.\n",
    "\n",
    "model_path = \"meta-llama/Llama-4-Scout-17B-16E-Instruct\"\n",
    "chat_template = \"llama-4\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lets create a prompt.\n",
    "\n",
    "from io import BytesIO\n",
    "import requests\n",
    "from PIL import Image\n",
    "\n",
    "from sglang.srt.parser.conversation import chat_templates\n",
    "\n",
    "image = Image.open(\n",
    "    BytesIO(\n",
    "        requests.get(\n",
    "            \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n",
    "        ).content\n",
    "    )\n",
    ")\n",
    "\n",
    "conv = chat_templates[chat_template].copy()\n",
    "conv.append_message(conv.roles[0], f\"What's shown here: {conv.image_token}?\")\n",
    "conv.append_message(conv.roles[1], \"\")\n",
    "conv.image_data = [image]\n",
    "\n",
    "print(conv.get_prompt())\n",
    "print(f\"Image size: {image.size}\")\n",
    "\n",
    "image"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "13",
   "metadata": {},
   "source": [
    "### Query via the offline Engine API"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "14",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sglang.test.test_utils import is_in_ci\n",
    "\n",
    "if not is_in_ci():\n",
    "    from sglang import Engine\n",
    "\n",
    "    llm = Engine(\n",
    "        model_path=model_path,\n",
    "        trust_remote_code=True,\n",
    "        enable_multimodal=True,\n",
    "        mem_fraction_static=0.8,\n",
    "        tp_size=4,\n",
    "        attention_backend=\"fa3\",\n",
    "        context_length=65536,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15",
   "metadata": {},
   "outputs": [],
   "source": [
    "if not is_in_ci():\n",
    "    out = llm.generate(prompt=conv.get_prompt(), image_data=[image])\n",
    "    print(out[\"text\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "16",
   "metadata": {},
   "source": [
    "### Query via the offline Engine API, but send precomputed embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17",
   "metadata": {},
   "outputs": [],
   "source": [
    "if not is_in_ci():\n",
    "    # Compute the image embeddings using Huggingface.\n",
    "\n",
    "    from transformers import AutoProcessor\n",
    "    from transformers import Llama4ForConditionalGeneration\n",
    "\n",
    "    processor = AutoProcessor.from_pretrained(model_path, use_fast=True)\n",
    "    model = Llama4ForConditionalGeneration.from_pretrained(\n",
    "        model_path, torch_dtype=\"auto\"\n",
    "    ).eval()\n",
    "    vision = model.vision_model.cuda()\n",
    "    multi_modal_projector = model.multi_modal_projector.cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18",
   "metadata": {},
   "outputs": [],
   "source": [
    "if not is_in_ci():\n",
    "    processed_prompt = processor(\n",
    "        images=[image], text=conv.get_prompt(), return_tensors=\"pt\"\n",
    "    )\n",
    "    print(f'{processed_prompt[\"pixel_values\"].shape=}')\n",
    "    input_ids = processed_prompt[\"input_ids\"][0].detach().cpu().tolist()\n",
    "\n",
    "    image_outputs = vision(\n",
    "        processed_prompt[\"pixel_values\"].to(\"cuda\"), output_hidden_states=False\n",
    "    )\n",
    "    image_features = image_outputs.last_hidden_state\n",
    "    vision_flat = image_features.view(-1, image_features.size(-1))\n",
    "    precomputed_embeddings = multi_modal_projector(vision_flat)\n",
    "\n",
    "    mm_item = dict(modality=\"IMAGE\", precomputed_embeddings=precomputed_embeddings)\n",
    "    out = llm.generate(input_ids=input_ids, image_data=[mm_item])\n",
    "    print(out[\"text\"])"
   ]
  }
 ],
 "metadata": {
  "jupytext": {
   "cell_metadata_filter": "-all",
   "custom_cell_magics": "kql",
   "encoding": "# -*- coding: utf-8 -*-"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}