# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Test suite for profile_sla dry-run functionality.

These tests ensure that the profile_sla script can successfully run in dry-run mode
for vllm, sglang, and trtllm backends with their respective disagg.yaml configurations.
"""

import sys
from pathlib import Path
from unittest.mock import patch

import pytest

# Add the project root to sys.path to enable imports.
# The root is taken to be three directory levels above this file. This must
# run before the dynamo imports below, which is why they carry `noqa: E402`.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from dynamo.profiler.profile_sla import run_profile  # noqa: E402
from dynamo.profiler.utils.defaults import SearchStrategy  # noqa: E402
from dynamo.profiler.utils.model_info import ModelInfo  # noqa: E402
from dynamo.profiler.utils.search_space_autogen import (  # noqa: E402
    auto_generate_search_space,
)


# Override the logger fixture from conftest.py to prevent directory creation
@pytest.fixture(autouse=True)
def logger(request):
    """Override the logger fixture to prevent test directory creation.

    This replaces the logger fixture from tests/conftest.py that creates
    directories named after each test.

    Args:
        request: The pytest fixture request. It is accepted but not used.

    Yields:
        None. The fixture performs no setup and no teardown.
    """
    # Simply do nothing - no directories created, no file handlers added
    yield


class TestProfileSLADryRun:
    """Test class for profile_sla dry-run functionality."""

    @pytest.fixture
    def vllm_args(self, request):
        """Create arguments for vllm backend dry-run test.

        Args:
            request: The pytest fixture request; its node name is used to
                derive a per-test output directory and namespace.

        Returns:
            A plain attribute container with ``dry_run`` enabled that points
            at the vllm ``disagg.yaml`` example config.
        """

        class Args:
            """Attribute container standing in for the profiler's arguments."""

            def __init__(self):
                self.backend = "vllm"
                self.config = "examples/backends/vllm/deploy/disagg.yaml"
                # Use unique output directory per test for parallel execution
                self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
                self.namespace = f"test-namespace-{request.node.name}"
                self.model = ""
                self.dgd_image = ""
                self.min_num_gpus_per_engine = 1
                self.max_num_gpus_per_engine = 8
                self.skip_existing_results = False
                self.force_rerun = False
                self.isl = 3000
                self.osl = 500
                self.ttft = 50
                self.itl = 10
                self.max_context_length = 16384
                self.prefill_interpolation_granularity = 16
                self.decode_interpolation_granularity = 6
                self.service_name = ""
                self.dry_run = True
                self.aic_system = None
                self.aic_hf_id = None
                self.aic_backend = ""
                self.aic_backend_version = None
                self.num_gpus_per_node = 8
                self.search_strategy = SearchStrategy.THOROUGH
                self.system = ""
                self.deploy_after_profile = False
                self.pick_with_webui = False
                self.model_cache_pvc_name = ""
                self.model_cache_pvc_path = ""
                self.model_cache_pvc_mount_path = "/opt/model-cache"
                # Provide minimal model_info to avoid HF queries
                self.model_info = ModelInfo(
                    model_size=16384.0,
                    architecture="TestArchitecture",
                    is_moe=False,
                    max_context_length=self.max_context_length,
                )

        return Args()

    @pytest.fixture
    def sglang_args(self, request):
        """Create arguments for sglang backend dry-run test.

        Args:
            request: The pytest fixture request; its node name is used to
                derive a per-test output directory and namespace.

        Returns:
            A plain attribute container with ``dry_run`` enabled that points
            at the sglang ``disagg.yaml`` example config.
        """

        class Args:
            """Attribute container standing in for the profiler's arguments."""

            def __init__(self):
                self.backend = "sglang"
                self.config = "examples/backends/sglang/deploy/disagg.yaml"
                # Use unique output directory per test for parallel execution
                self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
                self.namespace = f"test-namespace-{request.node.name}"
                self.model = ""
                self.dgd_image = ""
                self.min_num_gpus_per_engine = 1
                self.max_num_gpus_per_engine = 8
                self.skip_existing_results = False
                self.force_rerun = False
                self.isl = 3000
                self.osl = 500
                self.ttft = 50
                self.itl = 10
                self.max_context_length = 16384
                self.prefill_interpolation_granularity = 16
                self.decode_interpolation_granularity = 6
                self.service_name = ""
                self.dry_run = True
                self.aic_system = None
                self.aic_hf_id = None
                self.aic_backend = ""
                self.aic_backend_version = None
                self.num_gpus_per_node = 8
                self.search_strategy = SearchStrategy.THOROUGH
                self.system = ""
                self.deploy_after_profile = False
                self.pick_with_webui = False
                self.model_cache_pvc_name = ""
                self.model_cache_pvc_path = ""
                self.model_cache_pvc_mount_path = "/opt/model-cache"
                # Pre-built model_info with placeholder values
                self.model_info = ModelInfo(
                    model_size=16384.0,
                    architecture="TestArchitecture",
                    is_moe=False,
                    max_context_length=self.max_context_length,
                )

        return Args()

    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.gpu_0
    @pytest.mark.integration
    @pytest.mark.vllm
    async def test_vllm_dryrun(self, vllm_args):
        """Test that profile_sla dry-run works for vllm backend with disagg.yaml config.

        The test makes no assertions of its own; it passes when
        ``run_profile`` completes without raising.
        """
        # Run the profile in dry-run mode - should complete without errors
        await run_profile(vllm_args)

    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.gpu_0
    @pytest.mark.integration
    @pytest.mark.sglang
    async def test_sglang_dryrun(self, sglang_args):
        """Test that profile_sla dry-run works for sglang backend with disagg.yaml config.

        The test makes no assertions of its own; it passes when
        ``run_profile`` completes without raising.
        """
        # Run the profile in dry-run mode - should complete without errors
        await run_profile(sglang_args)

    @pytest.fixture
    def trtllm_args(self, request):
        """Create arguments for trtllm backend dry-run test.

        Args:
            request: The pytest fixture request; its node name is used to
                derive a per-test output directory and namespace.

        Returns:
            A plain attribute container with ``dry_run`` enabled that points
            at the trtllm ``disagg.yaml`` example config.
        """

        class Args:
            """Attribute container standing in for the profiler's arguments."""

            def __init__(self):
                self.backend = "trtllm"
                self.config = "examples/backends/trtllm/deploy/disagg.yaml"
                # Use unique output directory per test for parallel execution
                self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
                self.namespace = f"test-namespace-{request.node.name}"
                self.model = ""
                self.dgd_image = ""
                self.min_num_gpus_per_engine = 1
                self.max_num_gpus_per_engine = 8
                self.skip_existing_results = False
                self.force_rerun = False
                self.isl = 3000
                self.osl = 500
                self.ttft = 50
                self.itl = 10
                self.max_context_length = 16384
                self.prefill_interpolation_granularity = 16
                self.decode_interpolation_granularity = 6
                self.service_name = ""
                self.dry_run = True
                self.aic_system = None
                self.aic_hf_id = None
                self.aic_backend = ""
                self.aic_backend_version = None
                self.num_gpus_per_node = 8
                self.search_strategy = SearchStrategy.THOROUGH
                self.system = ""
                self.deploy_after_profile = False
                self.pick_with_webui = False
                self.model_cache_pvc_name = ""
                self.model_cache_pvc_path = ""
                self.model_cache_pvc_mount_path = "/opt/model-cache"
                # Pre-built model_info with placeholder values
                self.model_info = ModelInfo(
                    model_size=16384.0,
                    architecture="TestArchitecture",
                    is_moe=False,
                    max_context_length=self.max_context_length,
                )

        return Args()

    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.gpu_0
    @pytest.mark.integration
    @pytest.mark.trtllm
    async def test_trtllm_dryrun(self, trtllm_args):
        """Test that profile_sla dry-run works for trtllm backend with disagg.yaml config.

        The test makes no assertions of its own; it passes when
        ``run_profile`` completes without raising.
        """
        # Run the profile in dry-run mode - should complete without errors
        await run_profile(trtllm_args)

    @pytest.fixture
    def sglang_moe_args(self, request):
        """Create arguments for sglang backend MoE dry-run test.

        Args:
            request: The pytest fixture request; its node name is used to
                derive a per-test output directory and namespace.

        Returns:
            A plain attribute container with ``dry_run`` enabled that points
            at the DeepSeek-R1 sglang 16-GPU disagg recipe and carries a
            ``ModelInfo`` flagged as MoE.
        """

        class Args:
            """Attribute container standing in for the profiler's arguments."""

            def __init__(self):
                self.backend = "sglang"
                self.config = "recipes/deepseek-r1/sglang/disagg-16gpu/deploy.yaml"
                # Use unique output directory per test for parallel execution
                self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
                self.namespace = f"test-namespace-{request.node.name}"
                self.model = ""
                self.dgd_image = ""
                self.min_num_gpus_per_engine = 8
                self.max_num_gpus_per_engine = 32
                self.skip_existing_results = False
                self.force_rerun = False
                self.isl = 3000
                self.osl = 500
                self.ttft = 50
                self.itl = 10
                self.max_context_length = 16384
                self.prefill_interpolation_granularity = 16
                self.decode_interpolation_granularity = 6
                self.service_name = ""
                self.dry_run = True
                self.aic_system = None
                self.aic_hf_id = None
                self.aic_backend = ""
                self.aic_backend_version = None
                self.num_gpus_per_node = 8
                self.search_strategy = SearchStrategy.THOROUGH
                self.system = ""
                self.deploy_after_profile = False
                self.pick_with_webui = False
                # Added in newer profiler versions; keep Args compatible with search_space_autogen
                self.model_cache_pvc_name = ""
                self.model_cache_pvc_path = ""
                self.model_cache_pvc_mount_path = "/opt/model-cache"
                # Pre-built MoE model_info with placeholder values
                self.model_info = ModelInfo(
                    model_size=65536.0,
                    architecture="TestMoEArchitecture",
                    is_moe=True,
                    max_context_length=self.max_context_length,
                    num_experts=16,
                )

        return Args()

    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.gpu_0
    @pytest.mark.integration
    @pytest.mark.sglang
    async def test_sglang_moe_dryrun(self, sglang_moe_args):
        """Test that profile_sla dry-run works for sglang backend with MoE config.

        The test makes no assertions of its own; it passes when
        ``run_profile`` completes without raising.
        """
        # Run the profile in dry-run mode - should complete without errors
        await run_profile(sglang_moe_args)

    # Example tests with mocked GPU inventory
    @pytest.fixture
    def mock_h100_gpu_info(self):
        """Return a static GPU description for an H100 80GB cluster.

        Returns:
            A dict with the keys ``gpus_per_node``, ``model`` and ``vram``.
        """
        gpu_info = dict(
            gpus_per_node=8,
            model="h100_sxm",
            vram=81920,  # 80GB in MiB
        )
        return gpu_info

    @pytest.fixture
    def mock_model_info(self):
        """Mock model info for DeepSeek-R1-Distill-Llama-8B.

        Returns:
            A non-MoE ``ModelInfo`` with fixed size, architecture and
            context-length values.
        """
        return ModelInfo(
            model_size=16384.0,  # 16GB model in MiB
            architecture="LlamaForCausalLM",
            is_moe=False,
            max_context_length=16384,
        )

    @pytest.fixture
    def vllm_args_with_model_autogen(self, request):
        """Create arguments for vllm backend with model-based search space autogeneration.

        Args:
            request: The pytest fixture request; its node name is used to
                derive a per-test output directory and namespace.

        Returns:
            A plain attribute container with an empty ``config``, a model
            name, zeroed GPU-per-engine bounds and H100 GPU attributes.
        """

        class Args:
            """Attribute container standing in for the profiler's arguments."""

            def __init__(self):
                self.backend = "vllm"
                self.config = ""
                # Use unique output directory per test for parallel execution
                self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
                self.namespace = f"test-namespace-{request.node.name}"
                self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"  # Specify model for autogen
                self.dgd_image = ""
                # Set to 0 to trigger auto-generation path
                self.min_num_gpus_per_engine = 0
                self.max_num_gpus_per_engine = 0
                self.skip_existing_results = False
                self.force_rerun = False
                self.isl = 3000
                self.osl = 500
                self.ttft = 50
                self.itl = 10
                self.max_context_length = 0
                self.prefill_interpolation_granularity = 16
                self.decode_interpolation_granularity = 6
                self.service_name = ""
                self.dry_run = True
                self.system = "h100_sxm"  # Renamed from aic_system, moved to hardware
                self.search_strategy = SearchStrategy.RAPID  # New top-level arg
                # GPU discovery values (auto-populated by Operator)
                self.num_gpus_per_node = 8
                self.gpu_model = "H100-SXM5-80GB"
                self.gpu_vram_mib = 81920
                self.deploy_after_profile = False
                self.pick_with_webui = False
                self.model_cache_pvc_name = ""
                self.model_cache_pvc_path = ""
                self.model_cache_pvc_mount_path = "/opt/model-cache"

        return Args()

    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.integration
    @pytest.mark.gpu_0
    @pytest.mark.vllm
    @patch("dynamo.profiler.utils.model_info.get_model_info")
    async def test_profile_with_autogen_search_space_h100(
        self,
        mock_get_model_info,
        vllm_args_with_model_autogen,
        mock_model_info,
    ):
        """Test profile_sla with auto-generated search space on mocked H100 cluster.

        This test demonstrates how search space is auto-generated based on model
        size and available GPU memory. GPU info is provided via command-line
        arguments injected by the Operator into the profiling config (DYN-2135).

        The test makes no assertions of its own; it passes when both calls
        complete without raising.
        """
        # Configure the mock to return the appropriate model info
        # NOTE(review): the patch targets the defining module; if the code under
        # test binds get_model_info by name at import time, this patch may not
        # take effect - confirm against the importing module.
        mock_get_model_info.return_value = mock_model_info

        # Run the profile - the search space will be auto-generated
        # based on the model and GPU info from args
        auto_generate_search_space(vllm_args_with_model_autogen)
        await run_profile(vllm_args_with_model_autogen)

    @pytest.fixture
    def sglang_args_with_model_autogen(self, request):
        """Create arguments for sglang backend with model-based search space autogeneration.

        Args:
            request: The pytest fixture request; its node name is used to
                derive a per-test output directory and namespace.

        Returns:
            A plain attribute container with an empty ``config``, a model
            name, zeroed GPU-per-engine bounds and H100 GPU attributes.
        """

        class Args:
            """Attribute container standing in for the profiler's arguments."""

            def __init__(self):
                self.backend = "sglang"
                self.config = ""
                # Use unique output directory per test for parallel execution
                self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
                self.namespace = f"test-namespace-{request.node.name}"
                self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"  # Specify model for autogen
                self.dgd_image = ""
                self.min_num_gpus_per_engine = 0
                self.max_num_gpus_per_engine = 0
                self.skip_existing_results = False
                self.force_rerun = False
                self.isl = 3000
                self.osl = 500
                self.ttft = 50
                self.itl = 10
                self.max_context_length = 0
                self.prefill_interpolation_granularity = 16
                self.decode_interpolation_granularity = 6
                self.service_name = ""
                self.dry_run = True
                self.system = "h100_sxm"  # Renamed from aic_system, moved to hardware
                self.search_strategy = SearchStrategy.RAPID  # New top-level arg
                # GPU discovery values (auto-populated by Operator)
                self.num_gpus_per_node = 8
                self.gpu_model = "H100-SXM5-80GB"
                self.gpu_vram_mib = 81920
                self.deploy_after_profile = False
                self.pick_with_webui = False
                self.model_cache_pvc_name = ""
                self.model_cache_pvc_path = ""
                self.model_cache_pvc_mount_path = "/opt/model-cache"

        return Args()

    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.gpu_0
    @pytest.mark.integration
    @pytest.mark.sglang
    @pytest.mark.skip(
        reason="Blocked on AI Configurator database format: sglang 0.5.6.post2 database "
        "is in legacy format missing 'gemm_dtype' field. "
        "See: KeyError in aiconfigurator/sdk/perf_database.py"
    )
    @patch("dynamo.profiler.utils.model_info.get_model_info")
    async def test_sglang_profile_with_autogen_search_space_h100(
        self,
        mock_get_model_info,
        sglang_args_with_model_autogen,
        mock_model_info,
    ):
        """Test profile_sla with auto-generated search space for sglang on mocked H100 cluster.

        This test demonstrates how search space is auto-generated based on model
        size and available GPU memory for sglang backend. GPU info is provided via
        command-line arguments injected by the Operator into the profiling config (DYN-2135).

        NOTE: Currently skipped due to AI Configurator database format issue.
        The sglang 0.5.6.post2 database for h100_sxm is in legacy format and missing
        the required 'gemm_dtype' field, causing KeyError during database loading.
        """
        # Configure the mock to return the appropriate model info
        # NOTE(review): the patch targets the defining module; if the code under
        # test binds get_model_info by name at import time, this patch may not
        # take effect - confirm against the importing module.
        mock_get_model_info.return_value = mock_model_info

        # Run the profile - the search space will be auto-generated
        # based on the model and GPU info from args
        auto_generate_search_space(sglang_args_with_model_autogen)
        await run_profile(sglang_args_with_model_autogen)

    @pytest.fixture
    def trtllm_args_with_model_autogen(self, request):
        """Create arguments for trtllm backend with model-based search space autogeneration.

        Args:
            request: The pytest fixture request; its node name is used to
                derive a per-test output directory and namespace.

        Returns:
            A plain attribute container with an empty ``config``, a model
            name, zeroed GPU-per-engine bounds and H100 GPU attributes.
        """

        class Args:
            """Attribute container standing in for the profiler's arguments."""

            def __init__(self):
                self.backend = "trtllm"
                self.config = ""
                # Use unique output directory per test for parallel execution
                self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
                self.namespace = f"test-namespace-{request.node.name}"
                self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"  # Specify model for autogen
                self.dgd_image = ""
                self.min_num_gpus_per_engine = 0
                self.max_num_gpus_per_engine = 0
                self.skip_existing_results = False
                self.force_rerun = False
                self.isl = 3000
                self.osl = 500
                self.ttft = 50
                self.itl = 10
                self.max_context_length = 0
                self.prefill_interpolation_granularity = 16
                self.decode_interpolation_granularity = 6
                self.service_name = ""
                self.dry_run = True
                self.system = "h100_sxm"  # Renamed from aic_system, moved to hardware
                self.search_strategy = SearchStrategy.RAPID  # New top-level arg
                # GPU discovery values (auto-populated by Operator)
                self.num_gpus_per_node = 8
                self.gpu_model = "H100-SXM5-80GB"
                self.gpu_vram_mib = 81920
                self.deploy_after_profile = False
                self.pick_with_webui = False
                self.model_cache_pvc_name = ""
                self.model_cache_pvc_path = ""
                self.model_cache_pvc_mount_path = "/opt/model-cache"

        return Args()

    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.gpu_0
    @pytest.mark.integration
    @pytest.mark.trtllm
    @patch("dynamo.profiler.utils.model_info.get_model_info")
    async def test_trtllm_profile_with_autogen_search_space_h100(
        self,
        mock_get_model_info,
        trtllm_args_with_model_autogen,
        mock_model_info,
    ):
        """Test profile_sla with auto-generated search space for trtllm on mocked H100 cluster.

        This test demonstrates how search space is auto-generated based on model
        size and available GPU memory for trtllm backend. GPU info is provided via
        command-line arguments injected by the Operator into the profiling config (DYN-2135).

        The test makes no assertions of its own; it passes when both calls
        complete without raising.
        """
        # Configure the mock to return the appropriate model info
        # NOTE(review): the patch targets the defining module; if the code under
        # test binds get_model_info by name at import time, this patch may not
        # take effect - confirm against the importing module.
        mock_get_model_info.return_value = mock_model_info

        # Run the profile - the search space will be auto-generated
        # based on the model and GPU info from args
        auto_generate_search_space(trtllm_args_with_model_autogen)
        await run_profile(trtllm_args_with_model_autogen)

    # Unit tests for search_strategy and system attributes
    @pytest.mark.pre_merge
    @pytest.mark.unit
    @pytest.mark.gpu_0
    def test_vllm_args_has_search_strategy(self, vllm_args):
        """Check the vllm_args fixture exposes search_strategy and system with their defaults."""
        expected_attrs = (
            ("search_strategy", SearchStrategy.THOROUGH),
            ("system", ""),
        )
        # Presence is checked before value, one attribute at a time.
        for attr_name, wanted in expected_attrs:
            assert hasattr(vllm_args, attr_name)
            assert getattr(vllm_args, attr_name) == wanted

    @pytest.mark.pre_merge
    @pytest.mark.unit
    @pytest.mark.gpu_0
    def test_sglang_args_has_search_strategy(self, sglang_args):
        """Check the sglang_args fixture exposes search_strategy and system with their defaults."""
        expected_attrs = (
            ("search_strategy", SearchStrategy.THOROUGH),
            ("system", ""),
        )
        # Presence is checked before value, one attribute at a time.
        for attr_name, wanted in expected_attrs:
            assert hasattr(sglang_args, attr_name)
            assert getattr(sglang_args, attr_name) == wanted

    @pytest.mark.pre_merge
    @pytest.mark.unit
    @pytest.mark.gpu_0
    def test_trtllm_args_has_search_strategy(self, trtllm_args):
        """Check the trtllm_args fixture exposes search_strategy and system with their defaults."""
        expected_attrs = (
            ("search_strategy", SearchStrategy.THOROUGH),
            ("system", ""),
        )
        # Presence is checked before value, one attribute at a time.
        for attr_name, wanted in expected_attrs:
            assert hasattr(trtllm_args, attr_name)
            assert getattr(trtllm_args, attr_name) == wanted

    @pytest.mark.pre_merge
    @pytest.mark.unit
    @pytest.mark.gpu_0
    def test_sglang_moe_args_has_search_strategy(self, sglang_moe_args):
        """Check the sglang_moe_args fixture exposes search_strategy and system with their defaults."""
        expected_attrs = (
            ("search_strategy", SearchStrategy.THOROUGH),
            ("system", ""),
        )
        # Presence is checked before value, one attribute at a time.
        for attr_name, wanted in expected_attrs:
            assert hasattr(sglang_moe_args, attr_name)
            assert getattr(sglang_moe_args, attr_name) == wanted

    @pytest.mark.pre_merge
    @pytest.mark.unit
    @pytest.mark.gpu_0
    def test_model_autogen_args_have_rapid_strategy(
        self,
        vllm_args_with_model_autogen,
        sglang_args_with_model_autogen,
        trtllm_args_with_model_autogen,
    ):
        """Check every model-autogen fixture carries the RAPID strategy and the H100 GPU attributes."""
        autogen_fixtures = (
            vllm_args_with_model_autogen,
            sglang_args_with_model_autogen,
            trtllm_args_with_model_autogen,
        )
        expected_attrs = (
            ("search_strategy", SearchStrategy.RAPID),
            ("system", "h100_sxm"),
            # Verify GPU discovery attributes
            ("num_gpus_per_node", 8),
            ("gpu_model", "H100-SXM5-80GB"),
            ("gpu_vram_mib", 81920),
        )
        for candidate in autogen_fixtures:
            # Presence is checked before value, one attribute at a time.
            for attr_name, wanted in expected_attrs:
                assert hasattr(candidate, attr_name)
                assert getattr(candidate, attr_name) == wanted