# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import base64
5
import dataclasses
6
7
import logging
import os
8
import random
9
from dataclasses import dataclass, field
10
from typing import Optional
11
12
13

import pytest

14
15
16
17
18
from tests.serve.common import (
    WORKSPACE_DIR,
    params_with_model_mark,
    run_serve_deployment,
)
19
from tests.serve.conftest import MULTIMODAL_IMG_URL, get_multimodal_test_image_bytes
20
from tests.serve.lora_utils import MinioLoraConfig
21
22
23
24
from tests.serve.multimodal_profiles.vllm import (
    VLLM_MULTIMODAL_PROFILES,
    VLLM_TOPOLOGY_SCRIPTS,
)
25
from tests.utils.constants import DefaultPort
26
from tests.utils.engine_process import EngineConfig
27
from tests.utils.multimodal import make_multimodal_configs
28
from tests.utils.payload_builder import (
29
    cached_tokens_chat_payload,
30
31
    chat_payload,
    chat_payload_default,
32
    chat_payload_with_logprobs,
33
    completion_payload_default,
34
    completion_payload_with_logprobs,
35
    metric_payload_default,
36
)
37
from tests.utils.payloads import LoraTestChatPayload, ToolCallingChatPayload
38
39
40
41

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


42
43
44
45
46
47
def _is_cuda13() -> bool:
    """Return True when ``CUDA_VERSION`` names CUDA major version 13.

    Accepts values such as "13", "13.0" or "13.0.1". Only the component before
    the first dot is compared, so an unset or empty variable, or a different
    major that merely starts with the same digits (e.g. "130.1"), is not
    treated as CUDA 13.
    """
    version = os.environ.get("CUDA_VERSION", "").strip()
    # Compare the major component exactly instead of using a string prefix.
    major = version.split(".", 1)[0]
    return major == "13"


48
@dataclass
class VLLMConfig(EngineConfig):
    """Configuration for vLLM test scenarios."""

    # Defaults to a single "VLLM:EngineCore" entry. A factory is used so every
    # config instance gets its own list rather than sharing one.
    # NOTE(review): presumably consumed by the EngineConfig base / test harness
    # as process names to handle after a run; confirm against EngineConfig.
    stragglers: list[str] = field(default_factory=lambda: ["VLLM:EngineCore"])
53
54


55
# Directory used as the ``directory`` of the vLLM configs below. A non-empty
# VLLM_DIR environment variable overrides the in-repo default location.
vllm_dir = os.environ.get("VLLM_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/vllm"
)
58

59
60
61
62
63
64
# Multimodal configs are generated from the profile definitions; entries from
# later profiles replace earlier ones when keys collide.
_mm_configs: dict[str, VLLMConfig] = {}
for _profile in VLLM_MULTIMODAL_PROFILES:
    _mm_configs |= make_multimodal_configs(
        _profile, VLLMConfig, vllm_dir, VLLM_TOPOLOGY_SCRIPTS
    )
65

66
# vLLM test configurations
# NOTE: pytest.mark.gpu_1 tests take ~5.5 minutes total to run sequentially (with models pre-cached)
# TODO: Now that these tests use dynamic ports and each config has VRAM markers,
# optimize the runtime by bin-packing multiple engine deployments in parallel on the same GPU.
# A future collector/launcher can sum profiled_vram_gib values to decide how many tests fit
# concurrently without exceeding available VRAM.
#
# Keys written out below come after the generated multimodal configs, so an
# explicit entry replaces a generated one that has the same key.
vllm_configs = {
    **_mm_configs,
    "aggregated": VLLMConfig(
        name="aggregated",
        directory=vllm_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
            # Actual profiled peak with kv-bytes.
            pytest.mark.profiled_vram_gib(3.8),
            # KV cache cap (2x safety over min=559_693_824).
            pytest.mark.requested_vllm_kv_cache_bytes(1_119_388_000),
            # ~8.5x observed 42.2s; bumped for GPU-parallel headroom.
            pytest.mark.timeout(360),
            pytest.mark.pre_merge,
        ],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            # Stop-string request: generation stops on "song" and the stop
            # string is included in the output.
            chat_payload(
                "Can you write me a song?",
                repeat_count=1,
                expected_response=["song"],
                temperature=0.0,
                max_tokens=32,
                extra_body={
                    "stop": ["song"],
                    "include_stop_str_in_output": True,
                },
            ),
            metric_payload_default(min_num_requests=6, backend="vllm"),
        ],
    ),
    "aggregated_logprobs": VLLMConfig(
        name="aggregated_logprobs",
        directory=vllm_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
            # Actual profiled peak with kv-bytes.
            pytest.mark.profiled_vram_gib(3.8),
            # KV cache cap (2x safety over min=559_693_824).
            pytest.mark.requested_vllm_kv_cache_bytes(1_119_388_000),
            # ~5x observed 24.3s; CI machines are slower.
            pytest.mark.timeout(120),
            pytest.mark.post_merge,
        ],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_with_logprobs(
                repeat_count=2,
                expected_response=["AI", "knock", "joke"],
                max_tokens=30,
                temperature=0.0,
                top_logprobs=3,
            ),
            completion_payload_with_logprobs(
                repeat_count=2,
                expected_response=["AI", "knock", "joke"],
                max_tokens=30,
                temperature=0.0,
                logprobs=5,
            ),
        ],
    ),
    "aggregated_lmcache": VLLMConfig(
        name="aggregated_lmcache",
        directory=vllm_dir,
        script_name="agg_lmcache.sh",
        marks=[
            pytest.mark.lmcache,
            pytest.mark.gpu_1,
            # Actual profiled peak with kv-bytes.
            pytest.mark.profiled_vram_gib(3.8),
            # KV cache cap (2x safety over min=559_693_824).
            pytest.mark.requested_vllm_kv_cache_bytes(1_119_388_000),
            # ~7x observed 49.0s; old value before profiling.
            pytest.mark.timeout(360),
            pytest.mark.pre_merge,
            pytest.mark.skipif(
                _is_cuda13(),
                reason="lmcache does not support CUDA 13 as of v0.3.11",
            ),
        ],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="vllm"),
            metric_payload_default(min_num_requests=6, backend="lmcache"),
        ],
    ),
    "aggregated_lmcache_multiproc": VLLMConfig(
        name="aggregated_lmcache_multiproc",
        directory=vllm_dir,
        script_name="agg_lmcache_multiproc.sh",
        marks=[
            pytest.mark.lmcache,
            pytest.mark.gpu_1,
            # Actual profiled peak with kv-bytes.
            pytest.mark.profiled_vram_gib(3.8),
            # KV cache cap (2x safety over min=559_693_824).
            pytest.mark.requested_vllm_kv_cache_bytes(1_119_388_000),
            # ~7x observed 49.3s; old value before profiling.
            pytest.mark.timeout(360),
            pytest.mark.pre_merge,
            pytest.mark.skipif(
                _is_cuda13(),
                reason="lmcache does not support CUDA 13 as of v0.3.11",
            ),
        ],
        model="Qwen/Qwen3-0.6B",
        # The directory name is built once, when this module is imported, from
        # the current PID plus a random suffix.
        env={
            "PROMETHEUS_MULTIPROC_DIR": f"/tmp/prometheus_multiproc_test_{os.getpid()}_{random.randint(0, 10000)}"
        },
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="vllm"),
            metric_payload_default(min_num_requests=6, backend="lmcache"),
        ],
    ),
    "agg-request-plane-tcp": VLLMConfig(
        name="agg-request-plane-tcp",
        directory=vllm_dir,
        script_name="agg_request_planes.sh",
        marks=[
            pytest.mark.gpu_1,
            # Actual profiled peak with kv-bytes.
            pytest.mark.profiled_vram_gib(3.8),
            # KV cache cap (2x safety over min=559_693_824).
            pytest.mark.requested_vllm_kv_cache_bytes(1_119_388_000),
            # ~8x observed 43.0s; bumped for GPU-parallel headroom.
            pytest.mark.timeout(360),
            pytest.mark.pre_merge,
        ],
        model="Qwen/Qwen3-0.6B",
        script_args=["--tcp"],
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "agg-request-plane-http": VLLMConfig(
        name="agg-request-plane-http",
        directory=vllm_dir,
        script_name="agg_request_planes.sh",
        marks=[
            pytest.mark.gpu_1,
            # Actual profiled peak with kv-bytes.
            pytest.mark.profiled_vram_gib(3.8),
            # KV cache cap (2x safety over min=559_693_824).
            pytest.mark.requested_vllm_kv_cache_bytes(1_119_388_000),
            # ~8.5x observed 42.3s; bumped for GPU-parallel headroom.
            pytest.mark.timeout(360),
            pytest.mark.pre_merge,
        ],
        model="Qwen/Qwen3-0.6B",
        script_args=["--http"],
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "agg-router": VLLMConfig(
        name="agg-router",
        directory=vllm_dir,
        script_name="agg_router.sh",
        # TODO: profile to get max_vram and timeout
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.pre_merge,
            pytest.mark.skip(reason="DYN-2263"),
        ],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(
                expected_log=[
                    r"ZMQ listener .* received batch with \d+ events \(seq=\d+(?:, [^)]*)?\)",
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_type=\w+, worker_id=\d+ dp_rank=.*?, logit: ",
                ]
            )
        ],
        env={
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
        },
    ),
    "agg-router-approx": VLLMConfig(
        name="agg-router-approx",
        directory=vllm_dir,
        script_name="agg_router_approx.sh",
        # TODO: profile to get max_vram and timeout
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.pre_merge,
            pytest.mark.skip(reason="DYN-2264"),
        ],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            # Test approximate KV routing (--no-kv-events mode).
            # Repeated requests should show cache-aware routing in logs.
            chat_payload_default(
                repeat_count=3,
                expected_log=[
                    # Verify scheduler is selecting workers with cache awareness
                    r"Selected worker: worker_type=\w+, worker_id=\d+ dp_rank=.*?, logit: ",
                    # After first request, should see cached blocks being tracked
                    r"with \d+ cached blocks",
                ],
            ),
            # Also test with cached tokens payload to verify usage field
            cached_tokens_chat_payload(
                repeat_count=3,
                expected_log=[
                    # Verify routing decision shows cache hits
                    r"with \d+ cached blocks",
                ],
            ),
        ],
        env={
            "DYN_LOG": "dynamo_kv_router::scheduling::selector=info",
        },
    ),
    "disaggregated": VLLMConfig(
        name="disaggregated",
        directory=vllm_dir,
        script_name="disagg.sh",
        # TODO: profile to get max_vram and timeout
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.pre_merge,
        ],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "deepep": VLLMConfig(
        name="deepep",
        directory=vllm_dir,
        script_name="dsr1_dep.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.vllm,
            pytest.mark.h100,
            pytest.mark.nightly,
            # TODO: profile to get max_vram and timeout
        ],
        model="deepseek-ai/DeepSeek-V2-Lite",
        script_args=[
            "--model",
            "deepseek-ai/DeepSeek-V2-Lite",
            "--num-nodes",
            "1",
            "--node-rank",
            "0",
            "--gpus-per-node",
            "2",
        ],
        timeout=700,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "multimodal_agg_frontend_decoding": VLLMConfig(
        name="multimodal_agg_frontend_decoding",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
        # post_merge because needs real NIXL not stub
        marks=[
            pytest.mark.gpu_1,
            # Actual profiled peak with kv-bytes.
            pytest.mark.profiled_vram_gib(9.6),
            # KV cache cap (2x safety over min=855_244_800).
            pytest.mark.requested_vllm_kv_cache_bytes(1_710_490_000),
            # ~5x observed 43.7s; 2B model loads slower on CI.
            pytest.mark.timeout(220),
            pytest.mark.post_merge,
        ],
        model="Qwen/Qwen2-VL-2B-Instruct",
        # Pass --frontend-decoding to enable Rust frontend image decoding + NIXL RDMA transfer
        script_args=[
            "--model",
            "Qwen/Qwen2-VL-2B-Instruct",
            "--frontend-decoding",
        ],
        request_payloads=[
            chat_payload(
                [
                    {
                        "type": "text",
                        "text": "What colors are in the following image? Respond only with the colors.",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": MULTIMODAL_IMG_URL},
                    },
                ],
                repeat_count=1,
                expected_response=["green"],
                temperature=0.0,
                max_tokens=100,
            )
        ],
    ),
    "multimodal_agg_llava": VLLMConfig(
        name="multimodal_agg_llava",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
        marks=[
            pytest.mark.gpu_1,
            # Actual profiled peak with kv-bytes.
            pytest.mark.profiled_vram_gib(14.9),
            # KV cache cap (2x safety over min=461_176_832).
            pytest.mark.requested_vllm_kv_cache_bytes(922_354_000),
            # ~7x observed 42.7s; 7B model loads ~48s on CI (A10G/L4).
            # NOTE(review): this pytest timeout (300) is lower than the config
            # timeout below (360); confirm that is intended.
            pytest.mark.timeout(300),
            pytest.mark.nightly,
            # https://github.com/ai-dynamo/dynamo/issues/4501
            pytest.mark.xfail(strict=False),
        ],
        model="llava-hf/llava-1.5-7b-hf",
        script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
        delayed_start=0,
        timeout=360,
        request_payloads=[
            # HTTP URL test
            chat_payload(
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["bus"],
                temperature=0.0,
            ),
            # String content test - verifies string → array conversion for multimodal templates
            chat_payload_default(
                repeat_count=1,
                expected_response=[],  # Just validate no error
            ),
        ],
    ),
    "aggregated_toolcalling": VLLMConfig(
        name="aggregated_toolcalling",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
        marks=[
            pytest.mark.skip(
                reason="Nightly CI failure: https://linear.app/nvidia/issue/DYN-2604"
            ),
            # agg_multimodal.sh uses single GPU
            pytest.mark.gpu_1,
            pytest.mark.multimodal,
            pytest.mark.nightly,
        ],
        model="Qwen/Qwen3-VL-30B-A3B-Instruct-FP8",
        script_args=[
            "--model",
            "Qwen/Qwen3-VL-30B-A3B-Instruct-FP8",
            "--max-model-len",
            "10000",
            "--dyn-tool-call-parser",
            "hermes",
        ],
        delayed_start=0,
        timeout=600,
        request_payloads=[
            ToolCallingChatPayload(
                body={
                    "messages": [
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": "Describe what you see in this image in detail.",
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {"url": MULTIMODAL_IMG_URL},
                                },
                            ],
                        }
                    ],
                    "tools": [
                        {
                            "type": "function",
                            "function": {
                                "name": "describe_image",
                                "description": "Provides detailed description of objects and scenes in an image",
                                "parameters": {
                                    "type": "object",
                                    "properties": {
                                        "objects": {
                                            "type": "array",
                                            "items": {"type": "string"},
                                            "description": "List of objects detected in the image",
                                        },
                                        "scene": {
                                            "type": "string",
                                            "description": "Overall scene description",
                                        },
                                    },
                                    "required": ["objects", "scene"],
                                },
                            },
                        }
                    ],
                    "tool_choice": "auto",
                    "max_tokens": 1024,
                },
                repeat_count=1,
                # OR: pass if any keyword found in tool args
                expected_response=[
                    "green",
                    "purple",
                    "llm",
                    "optimize",
                    "deploy",
                ],
                expected_log=[],
                # Validate tool call happened
                expected_tool_name="describe_image",
            )
        ],
    ),
    # TODO: Enable this test case when we have 4 GPUs runners.
    # "multimodal_disagg": VLLMConfig(
    #     name="multimodal_disagg",
    #     directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
    #     script_name="disagg.sh",
    #     marks=[pytest.mark.gpu_4, pytest.mark.vllm],
    #     model="llava-hf/llava-1.5-7b-hf",
    #     delayed_start=45,
    #     script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
    # ),
    "completions_only": VLLMConfig(
        name="completions_only",
        directory=vllm_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
            # Actual profiled peak with kv-bytes.
            pytest.mark.profiled_vram_gib(18.3),
            # KV cache cap (2x safety over min=2_037_448_704).
            pytest.mark.requested_vllm_kv_cache_bytes(4_074_898_000),
            # 7B model loads ~48s on CI (A10G/L4) vs ~15s locally.
            pytest.mark.timeout(420),
            pytest.mark.post_merge,
        ],
        model="deepseek-ai/deepseek-llm-7b-base",
        script_args=[
            "--model",
            "deepseek-ai/deepseek-llm-7b-base",
            "--dyn-endpoint-types",
            "completions",
        ],
        request_payloads=[
            completion_payload_default(),
        ],
    ),
    "multi_node_tp_headless": VLLMConfig(
        name="multi_node_tp_headless",
        directory=os.path.join(WORKSPACE_DIR, "tests/serve"),
        script_name="multi_node_tp_headless.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.pre_merge,
            # TODO: profile to get max_vram
            pytest.mark.timeout(300),
        ],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "guided_decoding": VLLMConfig(
        name="guided_decoding",
        directory=vllm_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
            # Actual profiled peak with kv-bytes.
            pytest.mark.profiled_vram_gib(3.8),
            # KV cache cap (2x safety over min=559_693_824).
            pytest.mark.requested_vllm_kv_cache_bytes(1_119_388_000),
            # ~5x observed 22.3s; CI machines are slower.
            pytest.mark.timeout(110),
            pytest.mark.pre_merge,
        ],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            # JSON-schema constrained output.
            chat_payload(
                "Generate a person with name and age",
                repeat_count=1,
                expected_response=['"name"', '"age"'],
                temperature=0.0,
                max_tokens=100,
                extra_body={
                    "guided_json": {
                        "type": "object",
                        "properties": {
                            "name": {"type": "string"},
                            "age": {"type": "integer"},
                        },
                        "required": ["name", "age"],
                    }
                },
            ),
            # Regex constrained output.
            chat_payload(
                "Generate a color name (red, blue, or green)",
                repeat_count=1,
                expected_response=["red", "blue", "green"],
                temperature=0.0,
                max_tokens=20,
                extra_body={"guided_regex": r"(red|blue|green)"},
            ),
            # Choice-list constrained output.
            chat_payload(
                "Generate a color name (red, blue, or green)",
                repeat_count=1,
                expected_response=["red", "blue", "green"],
                temperature=0.0,
                max_tokens=20,
                extra_body={"guided_choice": ["red", "blue", "green"]},
            ),
        ],
    ),
}


Alec's avatar
Alec committed
607
@pytest.fixture(params=params_with_model_mark(vllm_configs))
def vllm_config_test(request):
    """Fixture that provides different vLLM test configurations.

    The fixture is parametrized with ``params_with_model_mark(vllm_configs)``;
    each ``request.param`` is used as a key into ``vllm_configs`` and the
    matching config is returned.
    """
    return vllm_configs[request.param]


613
@pytest.mark.vllm
@pytest.mark.e2e
def test_serve_deployment(
    vllm_config_test,
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    predownload_models,
    image_server,
):
    """
    Test dynamo serve deployments with different graph configurations.

    The parametrized config is copied with its frontend port set to the
    dynamically allocated one before the deployment is run. Fixtures that are
    not referenced in the body are requested only so that pytest sets them up.
    """
    # dataclasses.replace returns a copy, so the shared entry in vllm_configs
    # is never mutated.
    config = dataclasses.replace(
        vllm_config_test, frontend_port=dynamo_dynamic_ports.frontend_port
    )
    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)
630
631


632
@pytest.mark.skip(reason="Nightly CI failure: https://linear.app/nvidia/issue/DYN-2605")
@pytest.mark.vllm
@pytest.mark.e2e
@pytest.mark.gpu_2
@pytest.mark.nightly
@pytest.mark.timeout(360)  # Match VLLMConfig.timeout for this multimodal deployment
def test_multimodal_b64(
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    predownload_models,
):
    """
    Test multimodal inference with base64 url passthrough.

    This test is separate because it loads the required image at runtime
    (not collection time), ensuring it only fails when actually executed.
    """
    # Load B64 image at test execution time (uses real PNG even if MULTIMODAL_IMG is LFS pointer)
    b64_img = base64.b64encode(get_multimodal_test_image_bytes()).decode()

    # Create payload with B64 image embedded as an inline data: URI
    b64_payload = chat_payload(
        [
            {
                "type": "text",
                "text": "What colors are in the following image? Respond only with the colors.",
            },
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{b64_img}"},
            },
        ],
        repeat_count=1,
        expected_response=["purple"],
        max_tokens=100,
    )

    # Create test config
    config = VLLMConfig(
        name="test_multimodal_b64",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
        marks=[],  # markers at function-level
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
        delayed_start=0,
        timeout=360,
        request_payloads=[b64_payload],
    )

    # Point the config at the dynamically allocated frontend port
    config = dataclasses.replace(
        config, frontend_port=dynamo_dynamic_ports.frontend_port
    )
    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)
687
688


689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
@pytest.mark.vllm
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.pre_merge
@pytest.mark.timeout(220)
def test_multimodal_b64_frontend_decoding(
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    predownload_models,
):
    """
    Run a base64-image multimodal request through the frontend decoding path.

    The image is sent as an inline base64 ``data:`` URI (not an HTTP URL) to a
    deployment launched with ``--frontend-decoding``, exercising the Rust
    frontend image decode + NIXL RDMA transfer path. This checks that the
    strip_inline_data_urls optimization does not break correctness.
    """
    model_name = "Qwen/Qwen3-VL-2B-Instruct"

    # Encode the image when the test runs and wrap it in a data: URI.
    encoded_image = base64.b64encode(get_multimodal_test_image_bytes()).decode()
    image_data_uri = f"data:image/png;base64,{encoded_image}"

    question_part = {
        "type": "text",
        "text": "What colors are in the following image? Respond only with the colors.",
    }
    image_part = {
        "type": "image_url",
        "image_url": {"url": image_data_uri},
    }
    payload = chat_payload(
        [question_part, image_part],
        repeat_count=1,
        expected_response=["green"],
        temperature=0.0,
        max_tokens=100,
    )

    base_config = VLLMConfig(
        name="test_multimodal_b64_frontend_decoding",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
        marks=[],
        model=model_name,
        script_args=["--model", model_name, "--frontend-decoding"],
        delayed_start=0,
        timeout=220,
        request_payloads=[payload],
    )

    # Run against a copy that targets the dynamically allocated frontend port.
    run_serve_deployment(
        dataclasses.replace(
            base_config, frontend_port=dynamo_dynamic_ports.frontend_port
        ),
        request,
        ports=dynamo_dynamic_ports,
    )


748
749
750
751
752
753
754
# LoRA test directory: the "launch/lora" subdirectory of vllm_dir.
lora_dir = os.path.join(vllm_dir, "launch/lora")


def lora_chat_payload(
    lora_name: str,
    s3_uri: str,
    system_port: int = DefaultPort.SYSTEM1.value,
    repeat_count: int = 2,
    expected_response: Optional[list] = None,
    expected_log: Optional[list] = None,
    max_tokens: int = 100,
    temperature: float = 0.0,
) -> LoraTestChatPayload:
    """Create a LoRA-enabled chat payload for testing.

    Args:
        lora_name: Name of the LoRA adapter; also sent as the request "model".
        s3_uri: URI of the adapter, passed through to the payload.
        system_port: System port passed through to the payload.
        repeat_count: Repeat count passed through to the payload.
        expected_response: Keywords expected in the response. ``None`` selects
            the default keyword list; an explicitly passed list, including an
            empty one, is used as given.
        expected_log: Expected log patterns; ``None`` means no patterns.
        max_tokens: "max_tokens" value of the request body.
        temperature: "temperature" value of the request body.

    Returns:
        A ``LoraTestChatPayload`` holding a non-streaming, single-user-message
        chat request plus the LoRA details above.
    """
    # Check against None rather than truthiness so that a caller passing []
    # to mean "no required keywords" is not overridden by the defaults.
    if expected_response is None:
        expected_response = ["learning", "neural", "network", "AI", "model"]
    if expected_log is None:
        expected_log = []

    return LoraTestChatPayload(
        body={
            "model": lora_name,
            "messages": [
                {
                    "role": "user",
                    "content": "What is deep learning? Answer in one sentence.",
                }
            ],
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stream": False,
        },
        lora_name=lora_name,
        s3_uri=s3_uri,
        system_port=system_port,
        repeat_count=repeat_count,
        expected_response=expected_response,
        expected_log=expected_log,
    )


@pytest.mark.vllm
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.model("Qwen/Qwen3-0.6B")
@pytest.mark.timeout(600)
@pytest.mark.post_merge
@pytest.mark.skip(reason="DYN-2260")
def test_lora_aggregated(
    request,
    runtime_services_dynamic_ports,
    predownload_models,
    minio_lora_service,
    dynamo_dynamic_ports,
):
    """
    Test LoRA inference with aggregated vLLM deployment.

    This test:
    1. Uses MinIO fixture to provide S3-compatible storage with uploaded LoRA
    2. Starts vLLM with LoRA support enabled
    3. Loads the LoRA adapter via system API
    4. Runs inference with the LoRA model
    """
    minio_config: MinioLoraConfig = minio_lora_service

    # Create payload that loads LoRA and tests inference
    lora_payload = lora_chat_payload(
        lora_name=minio_config.lora_name,
        s3_uri=minio_config.get_s3_uri(),
        system_port=DefaultPort.SYSTEM1.value,
        repeat_count=2,
    )

    # Fetch the MinIO environment once and reuse it for both the config and
    # the deployment, the same way test_lora_aggregated_router does.
    env_vars = minio_config.get_env_vars()

    # Create test config with MinIO environment variables
    config = VLLMConfig(
        name="test_lora_aggregated",
        directory=vllm_dir,
        script_name="lora/agg_lora.sh",
        marks=[],  # markers at function-level
        model="Qwen/Qwen3-0.6B",
        timeout=600,
        env=env_vars,
        request_payloads=[lora_payload],
    )

    # Point the config at the dynamically allocated frontend port
    config = dataclasses.replace(
        config, frontend_port=dynamo_dynamic_ports.frontend_port
    )
    run_serve_deployment(
        config,
        request,
        ports=dynamo_dynamic_ports,
        extra_env=env_vars,
    )
840
841
842
843
844
845
846


@pytest.mark.vllm
@pytest.mark.e2e
@pytest.mark.gpu_2
@pytest.mark.model("Qwen/Qwen3-0.6B")
@pytest.mark.timeout(600)
@pytest.mark.pre_merge
@pytest.mark.parametrize("num_system_ports", [2], indirect=True)
def test_lora_aggregated_router(
    request,
    runtime_services_dynamic_ports,
    predownload_models,
    minio_lora_service,
    dynamo_dynamic_ports,
    num_system_ports,
):
    """
    Test LoRA inference with aggregated vLLM deployment using KV router.

    This test:
    1. Uses MinIO fixture to provide S3-compatible storage with uploaded LoRA
    2. Starts multiple vLLM workers with LoRA support and KV router
    3. Loads the LoRA adapter on both workers via system API
    4. Runs inference with the LoRA model, verifying KV cache routing
    """
    assert (
        num_system_ports >= 2
    ), "serve tests require at least SYSTEM_PORT1 + SYSTEM_PORT2"

    minio_config: MinioLoraConfig = minio_lora_service

    # Create payloads that load LoRA on both workers and test inference
    # Worker 1 (DefaultPort.SYSTEM1)
    lora_payload_worker1 = lora_chat_payload(
        lora_name=minio_config.lora_name,
        s3_uri=minio_config.get_s3_uri(),
        system_port=DefaultPort.SYSTEM1.value,
        repeat_count=1,
    )

    # Worker 2 (DefaultPort.SYSTEM2)
    lora_payload_worker2 = lora_chat_payload(
        lora_name=minio_config.lora_name,
        s3_uri=minio_config.get_s3_uri(),
        system_port=DefaultPort.SYSTEM2.value,
        repeat_count=1,
    )

    # Additional inference payload to test routing (LoRA already loaded)
    inference_payload = chat_payload(
        content="Explain machine learning in simple terms.",
        repeat_count=2,
        expected_response=["learn", "data", "algorithm", "model", "pattern"],
        max_tokens=150,
        temperature=0.0,
    ).with_model(minio_config.lora_name)

    # Add env vars including PYTHONHASHSEED for deterministic KV event IDs
    env_vars = minio_config.get_env_vars()
    env_vars["PYTHONHASHSEED"] = "0"

    # Create test config with MinIO environment variables
    config = VLLMConfig(
        name="test_lora_aggregated_router",
        directory=vllm_dir,
        script_name="lora/agg_lora_router.sh",
        marks=[],  # markers at function-level
        model="Qwen/Qwen3-0.6B",
        timeout=600,
        env=env_vars,
        request_payloads=[
            lora_payload_worker1,
            lora_payload_worker2,
            inference_payload,
        ],
    )

    # Point the config at the dynamically allocated frontend port
    config = dataclasses.replace(
        config, frontend_port=dynamo_dynamic_ports.frontend_port
    )
    run_serve_deployment(
        config, request, ports=dynamo_dynamic_ports, extra_env=env_vars
    )