"test/vscode:/vscode.git/clone" did not exist on "557618baa6b7807763b9d8916f96eee0b57ea33a"
vlm_query.ipynb 8.13 KB
Newer Older
1
2
3
4
5
6
7
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0",
   "metadata": {},
   "source": [
    "# Query Vision Language Model"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1",
   "metadata": {},
   "source": [
    "## Querying Qwen-VL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import nest_asyncio\n",
    "\n",
    "nest_asyncio.apply()  # Run this first.\n",
    "\n",
    "model_path = \"Qwen/Qwen2.5-VL-3B-Instruct\"\n",
    "chat_template = \"qwen2-vl\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lets create a prompt.\n",
    "\n",
    "from io import BytesIO\n",
    "import requests\n",
    "from PIL import Image\n",
    "\n",
    "from sglang.srt.conversation import chat_templates\n",
    "\n",
    "image = Image.open(\n",
    "    BytesIO(\n",
    "        requests.get(\n",
    "            \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n",
    "        ).content\n",
    "    )\n",
    ")\n",
    "\n",
    "conv = chat_templates[chat_template].copy()\n",
    "conv.append_message(conv.roles[0], f\"What's shown here: {conv.image_token}?\")\n",
    "conv.append_message(conv.roles[1], \"\")\n",
    "conv.image_data = [image]\n",
    "\n",
    "print(conv.get_prompt())\n",
    "image"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4",
   "metadata": {},
   "source": [
    "### Query via the offline Engine API"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sglang import Engine\n",
    "\n",
    "llm = Engine(\n",
    "    model_path=model_path, chat_template=chat_template, mem_fraction_static=0.8\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6",
   "metadata": {},
   "outputs": [],
   "source": [
    "out = llm.generate(prompt=conv.get_prompt(), image_data=[image])\n",
    "print(out[\"text\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7",
   "metadata": {},
   "source": [
    "### Query via the offline Engine API, but send precomputed embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compute the image embeddings using Huggingface.\n",
    "\n",
    "from transformers import AutoProcessor\n",
    "from transformers import Qwen2_5_VLForConditionalGeneration\n",
    "\n",
    "processor = AutoProcessor.from_pretrained(model_path, use_fast=True)\n",
    "vision = (\n",
    "    Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path).eval().visual.cuda()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9",
   "metadata": {},
   "outputs": [],
   "source": [
    "processed_prompt = processor(\n",
    "    images=[image], text=conv.get_prompt(), return_tensors=\"pt\"\n",
    ")\n",
    "input_ids = processed_prompt[\"input_ids\"][0].detach().cpu().tolist()\n",
    "precomputed_embeddings = vision(\n",
    "    processed_prompt[\"pixel_values\"].cuda(), processed_prompt[\"image_grid_thw\"].cuda()\n",
    ")\n",
    "\n",
    "mm_item = dict(\n",
    "    modality=\"IMAGE\",\n",
    "    image_grid_thw=processed_prompt[\"image_grid_thw\"],\n",
    "    precomputed_embeddings=precomputed_embeddings,\n",
    ")\n",
    "out = llm.generate(input_ids=input_ids, image_data=[mm_item])\n",
    "print(out[\"text\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "10",
   "metadata": {},
   "source": [
    "## Querying Llama 4 (Vision)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11",
   "metadata": {},
   "outputs": [],
   "source": [
    "import nest_asyncio\n",
    "\n",
    "nest_asyncio.apply()  # Run this first.\n",
    "\n",
    "model_path = \"meta-llama/Llama-4-Scout-17B-16E-Instruct\"\n",
    "chat_template = \"llama-4\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lets create a prompt.\n",
    "\n",
    "from io import BytesIO\n",
    "import requests\n",
    "from PIL import Image\n",
    "\n",
    "from sglang.srt.conversation import chat_templates\n",
    "\n",
    "image = Image.open(\n",
    "    BytesIO(\n",
    "        requests.get(\n",
    "            \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n",
    "        ).content\n",
    "    )\n",
    ")\n",
    "\n",
    "conv = chat_templates[chat_template].copy()\n",
    "conv.append_message(conv.roles[0], f\"What's shown here: {conv.image_token}?\")\n",
    "conv.append_message(conv.roles[1], \"\")\n",
    "conv.image_data = [image]\n",
    "\n",
    "print(conv.get_prompt())\n",
    "print(f\"Image size: {image.size}\")\n",
    "\n",
    "image"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "13",
   "metadata": {},
   "source": [
    "### Query via the offline Engine API"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "14",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sglang.test.test_utils import is_in_ci\n",
    "\n",
    "if not is_in_ci():\n",
    "    from sglang import Engine\n",
    "\n",
    "    llm = Engine(\n",
    "        model_path=model_path,\n",
    "        trust_remote_code=True,\n",
    "        enable_multimodal=True,\n",
    "        mem_fraction_static=0.8,\n",
    "        tp_size=4,\n",
    "        attention_backend=\"fa3\",\n",
    "        context_length=65536,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15",
   "metadata": {},
   "outputs": [],
   "source": [
    "if not is_in_ci():\n",
    "    out = llm.generate(prompt=conv.get_prompt(), image_data=[image])\n",
    "    print(out[\"text\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "16",
   "metadata": {},
   "source": [
    "### Query via the offline Engine API, but send precomputed embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17",
   "metadata": {},
   "outputs": [],
   "source": [
    "if not is_in_ci():\n",
    "    # Compute the image embeddings using Huggingface.\n",
    "\n",
    "    from transformers import AutoProcessor\n",
    "    from transformers import Llama4ForConditionalGeneration\n",
    "\n",
    "    processor = AutoProcessor.from_pretrained(model_path, use_fast=True)\n",
    "    model = Llama4ForConditionalGeneration.from_pretrained(\n",
    "        model_path, torch_dtype=\"auto\"\n",
    "    ).eval()\n",
    "    vision = model.vision_model.cuda()\n",
    "    multi_modal_projector = model.multi_modal_projector.cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18",
   "metadata": {},
   "outputs": [],
   "source": [
    "if not is_in_ci():\n",
    "    processed_prompt = processor(\n",
    "        images=[image], text=conv.get_prompt(), return_tensors=\"pt\"\n",
    "    )\n",
    "    print(f'{processed_prompt[\"pixel_values\"].shape=}')\n",
    "    input_ids = processed_prompt[\"input_ids\"][0].detach().cpu().tolist()\n",
    "\n",
    "    image_outputs = vision(\n",
    "        processed_prompt[\"pixel_values\"].to(\"cuda\"), output_hidden_states=False\n",
    "    )\n",
    "    image_features = image_outputs.last_hidden_state\n",
    "    vision_flat = image_features.view(-1, image_features.size(-1))\n",
    "    precomputed_embeddings = multi_modal_projector(vision_flat)\n",
    "\n",
    "    mm_item = dict(modality=\"IMAGE\", precomputed_embeddings=precomputed_embeddings)\n",
    "    out = llm.generate(input_ids=input_ids, image_data=[mm_item])\n",
    "    print(out[\"text\"])"
   ]
  }
 ],
 "metadata": {
  "jupytext": {
   "cell_metadata_filter": "-all",
   "custom_cell_magics": "kql",
   "encoding": "# -*- coding: utf-8 -*-"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}