# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Test suite for profile_sla dry-run functionality.

This test ensures that the profile_sla script can successfully run in dry-run mode
for vllm, sglang, and trtllm backends with their respective disagg.yaml configurations.
"""

import sys
from pathlib import Path
from unittest.mock import patch

import pytest

# Add the project root to sys.path to enable imports
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

# The profiler imports must come after the sys.path tweak above (hence the
# E402 suppressions). If any of them is missing, the whole module is skipped
# rather than reported as a collection error.
try:
    from dynamo.profiler.profile_sla import run_profile  # noqa: E402
    from dynamo.profiler.utils.defaults import SearchStrategy  # noqa: E402
    from dynamo.profiler.utils.model_info import ModelInfo  # noqa: E402
    from dynamo.profiler.utils.search_space_autogen import (  # noqa: E402
        auto_generate_search_space,
    )
except ImportError as _e:
    pytest.skip(f"Skip testing (refactor in progress): {_e}", allow_module_level=True)


# Shadow the logger fixture from conftest.py so that no directories get created
@pytest.fixture(autouse=True)
def logger(request):
    """No-op stand-in for the ``logger`` fixture defined in tests/conftest.py.

    The conftest fixture creates a directory named after each test. This
    module-local override takes its place and does nothing: no directories
    are created and no file handlers are added.
    """
    # Nothing to set up or tear down; hand control straight back to the test.
    yield None


class TestProfileSLADryRun:
    """Test class for profile_sla dry-run functionality.

    Each ``*_args`` fixture returns a plain attribute namespace that is handed
    to ``run_profile`` (and, for the autogen tests, to
    ``auto_generate_search_space``) in place of parsed arguments. The
    namespaces are assembled by the private ``_make_args`` / ``_dryrun_args`` /
    ``_autogen_args`` helpers so the shared defaults are written only once.
    """

    # ------------------------------------------------------------------
    # Namespace builders (underscore-prefixed, so pytest does not collect them)
    # ------------------------------------------------------------------

    @staticmethod
    def _make_args(request, backend, config, **overrides):
        """Return an ``Args`` namespace holding the defaults common to all fixtures.

        Args:
            request: pytest ``request`` fixture; ``request.node.name`` is used
                to derive a per-test output directory and namespace.
            backend: Value stored as ``args.backend``.
            config: Value stored as ``args.config``.
            **overrides: Attributes to add, or to replace among the defaults.

        Returns:
            An ``Args`` instance with one attribute per default/override.
        """

        class Args:
            """Plain attribute bag standing in for the profiler's arguments."""

        values = {
            "backend": backend,
            "config": config,
            # Use unique output directory per test for parallel execution
            "output_dir": f"/tmp/test_profiling_results_{request.node.name}",
            "namespace": f"test-namespace-{request.node.name}",
            "model": "",
            "dgd_image": "",
            "min_num_gpus_per_engine": 1,
            "max_num_gpus_per_engine": 8,
            "skip_existing_results": False,
            "force_rerun": False,
            "isl": 3000,
            "osl": 500,
            "ttft": 50,
            "itl": 10,
            "max_context_length": 16384,
            "prefill_interpolation_granularity": 16,
            "decode_interpolation_granularity": 6,
            "service_name": "",
            "dry_run": True,
            "num_gpus_per_node": 8,
            "search_strategy": SearchStrategy.THOROUGH,
            "system": "",
            "deploy_after_profile": False,
            "pick_with_webui": False,
            # Added in newer profiler versions; keep Args compatible with search_space_autogen
            "model_cache_pvc_name": "",
            "model_cache_pvc_path": "",
            "model_cache_pvc_mount_path": "/opt/model-cache",
        }
        values.update(overrides)

        args = Args()
        for attr_name, attr_value in values.items():
            setattr(args, attr_name, attr_value)
        return args

    @classmethod
    def _dryrun_args(cls, request, backend, config, model_info=None, **overrides):
        """Build the namespace used by the config-file based dry-run tests.

        On top of the common defaults this adds the ``aic_*`` attributes and a
        ``model_info`` object.

        Args:
            request: pytest ``request`` fixture (see ``_make_args``).
            backend: Backend name stored as ``args.backend``.
            config: Path of the deployment YAML stored as ``args.config``.
            model_info: Pre-built ``ModelInfo``; when ``None`` a minimal
                non-MoE one is created from ``args.max_context_length``.
            **overrides: Extra attributes forwarded to ``_make_args``.
        """
        args = cls._make_args(
            request,
            backend,
            config,
            aic_system=None,
            aic_hf_id=None,
            aic_backend="",
            aic_backend_version=None,
            **overrides,
        )
        if model_info is None:
            # Provide minimal model_info to avoid HF queries
            model_info = ModelInfo(
                model_size=16384.0,
                architecture="TestArchitecture",
                is_moe=False,
                max_context_length=args.max_context_length,
            )
        args.model_info = model_info
        return args

    @classmethod
    def _autogen_args(cls, request, backend):
        """Build the namespace used by the search-space auto-generation tests.

        Unlike ``_dryrun_args`` this sets neither the ``aic_*`` attributes nor
        ``model_info``; it names a model and carries the GPU attributes instead.

        Args:
            request: pytest ``request`` fixture (see ``_make_args``).
            backend: Backend name stored as ``args.backend``.
        """
        return cls._make_args(
            request,
            backend,
            "",
            model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",  # Specify model for autogen
            # Set to 0 to trigger auto-generation path
            min_num_gpus_per_engine=0,
            max_num_gpus_per_engine=0,
            max_context_length=0,
            system="h100_sxm",  # Renamed from aic_system, moved to hardware
            search_strategy=SearchStrategy.RAPID,  # New top-level arg
            # GPU discovery values (auto-populated by Operator)
            num_gpus_per_node=8,
            gpu_model="H100-SXM5-80GB",
            gpu_vram_mib=81920,
        )

    # ------------------------------------------------------------------
    # Dry-run tests driven by a deployment config file
    # ------------------------------------------------------------------

    @pytest.fixture
    def vllm_args(self, request):
        """Create arguments for vllm backend dry-run test."""
        return self._dryrun_args(
            request, "vllm", "examples/backends/vllm/deploy/disagg.yaml"
        )

    @pytest.fixture
    def sglang_args(self, request):
        """Create arguments for sglang backend dry-run test."""
        return self._dryrun_args(
            request, "sglang", "examples/backends/sglang/deploy/disagg.yaml"
        )

    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.gpu_0
    @pytest.mark.integration
    @pytest.mark.vllm
    async def test_vllm_dryrun(self, vllm_args):
        """Test that profile_sla dry-run works for vllm backend with disagg.yaml config."""
        # Run the profile in dry-run mode - should complete without errors
        await run_profile(vllm_args)

    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.gpu_0
    @pytest.mark.integration
    @pytest.mark.sglang
    async def test_sglang_dryrun(self, sglang_args):
        """Test that profile_sla dry-run works for sglang backend with disagg.yaml config."""
        # Run the profile in dry-run mode - should complete without errors
        await run_profile(sglang_args)

    @pytest.fixture
    def trtllm_args(self, request):
        """Create arguments for trtllm backend dry-run test."""
        return self._dryrun_args(
            request, "trtllm", "examples/backends/trtllm/deploy/disagg.yaml"
        )

    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.gpu_0
    @pytest.mark.integration
    @pytest.mark.trtllm
    async def test_trtllm_dryrun(self, trtllm_args):
        """Test that profile_sla dry-run works for trtllm backend with disagg.yaml config."""
        # Run the profile in dry-run mode - should complete without errors
        await run_profile(trtllm_args)

    @pytest.fixture
    def sglang_moe_args(self, request):
        """Create arguments for sglang backend MoE dry-run test."""
        moe_model_info = ModelInfo(
            model_size=65536.0,
            architecture="TestMoEArchitecture",
            is_moe=True,
            max_context_length=16384,
            num_experts=16,
        )
        return self._dryrun_args(
            request,
            "sglang",
            "recipes/deepseek-r1/sglang/disagg-16gpu/deploy.yaml",
            model_info=moe_model_info,
            min_num_gpus_per_engine=8,
            max_num_gpus_per_engine=32,
        )

    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.gpu_0
    @pytest.mark.integration
    @pytest.mark.sglang
    async def test_sglang_moe_dryrun(self, sglang_moe_args):
        """Test that profile_sla dry-run works for sglang backend with MoE config."""
        # Run the profile in dry-run mode - should complete without errors
        await run_profile(sglang_moe_args)

    # ------------------------------------------------------------------
    # Example tests with mocked GPU inventory
    # ------------------------------------------------------------------

    @pytest.fixture
    def mock_h100_gpu_info(self):
        """Mock GPU info for H100 80GB cluster."""
        return {
            "gpus_per_node": 8,
            "model": "h100_sxm",
            "vram": 81920,  # 80GB in MiB
        }

    @pytest.fixture
    def mock_model_info(self):
        """Mock model info for DeepSeek-R1-Distill-Llama-8B."""
        return ModelInfo(
            model_size=16384.0,  # 16GB model in MiB
            architecture="LlamaForCausalLM",
            is_moe=False,
            max_context_length=16384,
        )

    @pytest.fixture
    def vllm_args_with_model_autogen(self, request):
        """Create arguments for vllm backend with model-based search space autogeneration."""
        return self._autogen_args(request, "vllm")

    # NOTE(review): the three autogen tests patch ``get_model_info`` on the
    # ``model_info`` module itself. That only takes effect if callers look the
    # function up through that module at call time; if they import the name
    # directly, the patch target should be the importing module - confirm.
    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.integration
    @pytest.mark.gpu_0
    @pytest.mark.vllm
    @patch("dynamo.profiler.utils.model_info.get_model_info")
    async def test_profile_with_autogen_search_space_h100(
        self,
        mock_get_model_info,
        vllm_args_with_model_autogen,
        mock_model_info,
    ):
        """Test profile_sla with auto-generated search space on mocked H100 cluster.

        This test demonstrates how search space is auto-generated based on model
        size and available GPU memory. GPU info is provided via command-line
        arguments injected by the Operator into the profiling config (DYN-2135).
        """
        # Configure the mock to return the appropriate model info
        mock_get_model_info.return_value = mock_model_info

        # Run the profile - the search space will be auto-generated
        # based on the model and GPU info from args
        auto_generate_search_space(vllm_args_with_model_autogen)
        await run_profile(vllm_args_with_model_autogen)

    @pytest.fixture
    def sglang_args_with_model_autogen(self, request):
        """Create arguments for sglang backend with model-based search space autogeneration."""
        return self._autogen_args(request, "sglang")

    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.gpu_0
    @pytest.mark.integration
    @pytest.mark.sglang
    @pytest.mark.skip(
        reason="Blocked on AI Configurator database format: sglang 0.5.6.post2 database "
        "is in legacy format missing 'gemm_dtype' field. "
        "See: KeyError in aiconfigurator/sdk/perf_database.py"
    )
    @patch("dynamo.profiler.utils.model_info.get_model_info")
    async def test_sglang_profile_with_autogen_search_space_h100(
        self,
        mock_get_model_info,
        sglang_args_with_model_autogen,
        mock_model_info,
    ):
        """Test profile_sla with auto-generated search space for sglang on mocked H100 cluster.

        This test demonstrates how search space is auto-generated based on model
        size and available GPU memory for sglang backend. GPU info is provided via
        command-line arguments injected by the Operator into the profiling config (DYN-2135).

        NOTE: Currently skipped due to AI Configurator database format issue.
        The sglang 0.5.6.post2 database for h100_sxm is in legacy format and missing
        the required 'gemm_dtype' field, causing KeyError during database loading.
        """
        # Configure the mock to return the appropriate model info
        mock_get_model_info.return_value = mock_model_info

        # Run the profile - the search space will be auto-generated
        # based on the model and GPU info from args
        auto_generate_search_space(sglang_args_with_model_autogen)
        await run_profile(sglang_args_with_model_autogen)

    @pytest.fixture
    def trtllm_args_with_model_autogen(self, request):
        """Create arguments for trtllm backend with model-based search space autogeneration."""
        return self._autogen_args(request, "trtllm")

    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.gpu_0
    @pytest.mark.integration
    @pytest.mark.trtllm
    @patch("dynamo.profiler.utils.model_info.get_model_info")
    async def test_trtllm_profile_with_autogen_search_space_h100(
        self,
        mock_get_model_info,
        trtllm_args_with_model_autogen,
        mock_model_info,
    ):
        """Test profile_sla with auto-generated search space for trtllm on mocked H100 cluster.

        This test demonstrates how search space is auto-generated based on model
        size and available GPU memory for trtllm backend. GPU info is provided via
        command-line arguments injected by the Operator into the profiling config (DYN-2135).
        """
        # Configure the mock to return the appropriate model info
        mock_get_model_info.return_value = mock_model_info

        # Run the profile - the search space will be auto-generated
        # based on the model and GPU info from args
        auto_generate_search_space(trtllm_args_with_model_autogen)
        await run_profile(trtllm_args_with_model_autogen)

    # ------------------------------------------------------------------
    # Unit tests for search_strategy and system attributes
    # ------------------------------------------------------------------

    @pytest.mark.pre_merge
    @pytest.mark.unit
    @pytest.mark.gpu_0
    def test_vllm_args_has_search_strategy(self, vllm_args):
        """Test that vllm_args fixture has search_strategy attribute."""
        assert hasattr(vllm_args, "search_strategy")
        assert vllm_args.search_strategy == SearchStrategy.THOROUGH
        assert hasattr(vllm_args, "system")
        assert vllm_args.system == ""

    @pytest.mark.pre_merge
    @pytest.mark.unit
    @pytest.mark.gpu_0
    def test_sglang_args_has_search_strategy(self, sglang_args):
        """Test that sglang_args fixture has search_strategy attribute."""
        assert hasattr(sglang_args, "search_strategy")
        assert sglang_args.search_strategy == SearchStrategy.THOROUGH
        assert hasattr(sglang_args, "system")
        assert sglang_args.system == ""

    @pytest.mark.pre_merge
    @pytest.mark.unit
    @pytest.mark.gpu_0
    def test_trtllm_args_has_search_strategy(self, trtllm_args):
        """Test that trtllm_args fixture has search_strategy attribute."""
        assert hasattr(trtllm_args, "search_strategy")
        assert trtllm_args.search_strategy == SearchStrategy.THOROUGH
        assert hasattr(trtllm_args, "system")
        assert trtllm_args.system == ""

    @pytest.mark.pre_merge
    @pytest.mark.unit
    @pytest.mark.gpu_0
    def test_sglang_moe_args_has_search_strategy(self, sglang_moe_args):
        """Test that sglang_moe_args fixture has search_strategy attribute."""
        assert hasattr(sglang_moe_args, "search_strategy")
        assert sglang_moe_args.search_strategy == SearchStrategy.THOROUGH
        assert hasattr(sglang_moe_args, "system")
        assert sglang_moe_args.system == ""

    @pytest.mark.pre_merge
    @pytest.mark.unit
    @pytest.mark.gpu_0
    def test_model_autogen_args_have_rapid_strategy(
        self,
        vllm_args_with_model_autogen,
        sglang_args_with_model_autogen,
        trtllm_args_with_model_autogen,
    ):
        """Test that model autogen fixtures have RAPID search strategy and GPU info."""
        for args_fixture in [
            vllm_args_with_model_autogen,
            sglang_args_with_model_autogen,
            trtllm_args_with_model_autogen,
        ]:
            assert hasattr(args_fixture, "search_strategy")
            assert args_fixture.search_strategy == SearchStrategy.RAPID
            assert hasattr(args_fixture, "system")
            assert args_fixture.system == "h100_sxm"
            # Verify GPU discovery attributes
            assert hasattr(args_fixture, "num_gpus_per_node")
            assert args_fixture.num_gpus_per_node == 8
            assert hasattr(args_fixture, "gpu_model")
            assert args_fixture.gpu_model == "H100-SXM5-80GB"
            assert hasattr(args_fixture, "gpu_vram_mib")
            assert args_fixture.gpu_vram_mib == 81920