"tests/compile/distributed/test_fusion_all_reduce.py" did not exist on "881e1af43a1bb7b4bedd373e413eb7ad9dc9f920"
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Test suite for profile_sla dry-run functionality.

This test ensures that the profile_sla script can successfully run in dry-run mode
for vllm, sglang, and trtllm backends with their respective disagg.yaml configurations.
"""

import sys
from pathlib import Path
13
from unittest.mock import patch
14
15
16
17
18
19
20
21

import pytest

# Add the project root to sys.path to enable imports
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from benchmarks.profiler.profile_sla import run_profile  # noqa: E402
22
from benchmarks.profiler.utils.model_info import ModelInfo  # noqa: E402
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from benchmarks.profiler.utils.search_space_autogen import (  # noqa: E402
    auto_generate_search_space,
)


# Override the logger fixture from conftest.py to prevent directory creation
@pytest.fixture(autouse=True)
def logger(request):
    """Override the logger fixture to prevent test directory creation.

    This replaces the logger fixture from tests/conftest.py that creates
    directories named after each test. Being ``autouse``, it applies to every
    test in this module without being requested explicitly.

    Args:
        request: pytest's built-in ``request`` fixture; accepted but unused.

    Yields:
        None. There is no setup or teardown work.
    """
    # Simply do nothing - no directories created, no file handlers added
    yield


class TestProfileSLADryRun:
    """Test class for profile_sla dry-run functionality."""

    @pytest.fixture
    def vllm_args(self, request):
        """Create arguments for vllm backend dry-run test.

        Args:
            request: pytest's built-in ``request`` fixture. The requesting
                test's node name is embedded in ``output_dir`` and
                ``namespace`` so tests running in parallel do not collide.

        Returns:
            A plain attribute container with ``backend="vllm"``,
            ``dry_run=True`` and a pre-populated ``model_info``. The
            ``test_vllm_dryrun`` test passes it to ``run_profile``.
        """

        class Args:
            def __init__(self):
                self.backend = "vllm"
                self.config = "examples/backends/vllm/deploy/disagg.yaml"
                # Use unique output directory per test for parallel execution
                self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
                self.namespace = f"test-namespace-{request.node.name}"
                self.model = ""
                self.dgd_image = ""
                self.min_num_gpus_per_engine = 1
                self.max_num_gpus_per_engine = 8
                self.skip_existing_results = False
                self.force_rerun = False
                self.isl = 3000
                self.osl = 500
                self.ttft = 50
                self.itl = 10
                self.max_context_length = 16384
                self.prefill_interpolation_granularity = 16
                self.decode_interpolation_granularity = 6
                self.service_name = ""
                self.dry_run = True
                self.use_ai_configurator = False
                self.aic_system = None
                self.aic_hf_id = None
                self.aic_backend = ""
                self.aic_backend_version = None
                self.num_gpus_per_node = 8
                self.deploy_after_profile = False
                self.pick_with_webui = False
                self.model_cache_pvc_name = ""
                self.model_cache_pvc_path = ""
                self.model_cache_pvc_mount_path = "/opt/model-cache"
                # Provide minimal model_info to avoid HF queries
                self.model_info = ModelInfo(
                    model_size=16384.0,
                    architecture="TestArchitecture",
                    is_moe=False,
                    max_context_length=self.max_context_length,
                )

        return Args()

    @pytest.fixture
    def sglang_args(self, request):
        """Create arguments for sglang backend dry-run test.

        Args:
            request: pytest's built-in ``request`` fixture. The requesting
                test's node name is embedded in ``output_dir`` and
                ``namespace`` so tests running in parallel do not collide.

        Returns:
            A plain attribute container with ``backend="sglang"``,
            ``dry_run=True`` and a pre-populated ``model_info``. The
            ``test_sglang_dryrun`` test passes it to ``run_profile``.
        """

        class Args:
            def __init__(self):
                self.backend = "sglang"
                self.config = "examples/backends/sglang/deploy/disagg.yaml"
                # Use unique output directory per test for parallel execution
                self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
                self.namespace = f"test-namespace-{request.node.name}"
                self.model = ""
                self.dgd_image = ""
                self.min_num_gpus_per_engine = 1
                self.max_num_gpus_per_engine = 8
                self.skip_existing_results = False
                self.force_rerun = False
                self.isl = 3000
                self.osl = 500
                self.ttft = 50
                self.itl = 10
                self.max_context_length = 16384
                self.prefill_interpolation_granularity = 16
                self.decode_interpolation_granularity = 6
                self.service_name = ""
                self.dry_run = True
                self.use_ai_configurator = False
                self.aic_system = None
                self.aic_hf_id = None
                self.aic_backend = ""
                self.aic_backend_version = None
                self.num_gpus_per_node = 8
                self.deploy_after_profile = False
                self.pick_with_webui = False
                self.model_cache_pvc_name = ""
                self.model_cache_pvc_path = ""
                self.model_cache_pvc_mount_path = "/opt/model-cache"
                # Pre-built model_info, mirroring the other dry-run fixtures
                self.model_info = ModelInfo(
                    model_size=16384.0,
                    architecture="TestArchitecture",
                    is_moe=False,
                    max_context_length=self.max_context_length,
                )

        return Args()

    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.gpu_0
    @pytest.mark.integration
    @pytest.mark.vllm
    async def test_vllm_dryrun(self, vllm_args):
        """Test that profile_sla dry-run works for vllm backend with disagg.yaml config.

        The test makes no assertions on output; it passes when
        ``run_profile`` returns without raising.
        """
        # Run the profile in dry-run mode - should complete without errors
        await run_profile(vllm_args)

    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.gpu_0
    @pytest.mark.integration
    @pytest.mark.sglang
    async def test_sglang_dryrun(self, sglang_args):
        """Test that profile_sla dry-run works for sglang backend with disagg.yaml config.

        The test makes no assertions on output; it passes when
        ``run_profile`` returns without raising.
        """
        # Run the profile in dry-run mode - should complete without errors
        await run_profile(sglang_args)

    @pytest.fixture
    def trtllm_args(self, request):
        """Create arguments for trtllm backend dry-run test.

        Args:
            request: pytest's built-in ``request`` fixture. The requesting
                test's node name is embedded in ``output_dir`` and
                ``namespace`` so tests running in parallel do not collide.

        Returns:
            A plain attribute container with ``backend="trtllm"``,
            ``dry_run=True`` and a pre-populated ``model_info``. The
            ``test_trtllm_dryrun`` test passes it to ``run_profile``.
        """

        class Args:
            def __init__(self):
                self.backend = "trtllm"
                self.config = "examples/backends/trtllm/deploy/disagg.yaml"
                # Use unique output directory per test for parallel execution
                self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
                self.namespace = f"test-namespace-{request.node.name}"
                self.model = ""
                self.dgd_image = ""
                self.min_num_gpus_per_engine = 1
                self.max_num_gpus_per_engine = 8
                self.skip_existing_results = False
                self.force_rerun = False
                self.isl = 3000
                self.osl = 500
                self.ttft = 50
                self.itl = 10
                self.max_context_length = 16384
                self.prefill_interpolation_granularity = 16
                self.decode_interpolation_granularity = 6
                self.service_name = ""
                self.dry_run = True
                self.use_ai_configurator = False
                self.aic_system = None
                self.aic_hf_id = None
                self.aic_backend = ""
                self.aic_backend_version = None
                self.num_gpus_per_node = 8
                self.deploy_after_profile = False
                self.pick_with_webui = False
                self.model_cache_pvc_name = ""
                self.model_cache_pvc_path = ""
                self.model_cache_pvc_mount_path = "/opt/model-cache"
                # Pre-built model_info, mirroring the other dry-run fixtures
                self.model_info = ModelInfo(
                    model_size=16384.0,
                    architecture="TestArchitecture",
                    is_moe=False,
                    max_context_length=self.max_context_length,
                )

        return Args()

    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.gpu_0
    @pytest.mark.integration
    @pytest.mark.trtllm
    async def test_trtllm_dryrun(self, trtllm_args):
        """Test that profile_sla dry-run works for trtllm backend with disagg.yaml config.

        The test makes no assertions on output; it passes when
        ``run_profile`` returns without raising.
        """
        # Run the profile in dry-run mode - should complete without errors
        await run_profile(trtllm_args)

    @pytest.fixture
    def sglang_moe_args(self, request):
        """Create arguments for sglang backend MoE dry-run test.

        Differs from ``sglang_args`` in the config path (a DeepSeek-R1
        recipe), the per-engine GPU range (8 to 32) and the ``model_info``,
        which is flagged ``is_moe=True`` with 16 experts.

        Args:
            request: pytest's built-in ``request`` fixture. The requesting
                test's node name is embedded in ``output_dir`` and
                ``namespace`` so tests running in parallel do not collide.

        Returns:
            A plain attribute container with ``backend="sglang"`` and
            ``dry_run=True``. The ``test_sglang_moe_dryrun`` test passes it
            to ``run_profile``.
        """

        class Args:
            def __init__(self):
                self.backend = "sglang"
                self.config = "recipes/deepseek-r1/sglang/disagg-16gpu/deploy.yaml"
                # Use unique output directory per test for parallel execution
                self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
                self.namespace = f"test-namespace-{request.node.name}"
                self.model = ""
                self.dgd_image = ""
                self.min_num_gpus_per_engine = 8
                self.max_num_gpus_per_engine = 32
                self.skip_existing_results = False
                self.force_rerun = False
                self.isl = 3000
                self.osl = 500
                self.ttft = 50
                self.itl = 10
                self.max_context_length = 16384
                self.prefill_interpolation_granularity = 16
                self.decode_interpolation_granularity = 6
                self.service_name = ""
                self.dry_run = True
                self.use_ai_configurator = False
                self.aic_system = None
                self.aic_hf_id = None
                self.aic_backend = ""
                self.aic_backend_version = None
                self.num_gpus_per_node = 8
                self.deploy_after_profile = False
                self.pick_with_webui = False
                # Added in newer profiler versions; keep Args compatible with search_space_autogen
                self.model_cache_pvc_name = ""
                self.model_cache_pvc_path = ""
                self.model_cache_pvc_mount_path = "/opt/model-cache"
                # MoE-flavoured model_info supplied up front
                self.model_info = ModelInfo(
                    model_size=65536.0,
                    architecture="TestMoEArchitecture",
                    is_moe=True,
                    max_context_length=self.max_context_length,
                    num_experts=16,
                )

        return Args()

    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.gpu_0
    @pytest.mark.integration
    @pytest.mark.sglang
    async def test_sglang_moe_dryrun(self, sglang_moe_args):
        """Test that profile_sla dry-run works for sglang backend with MoE config.

        The test makes no assertions on output; it passes when
        ``run_profile`` returns without raising.
        """
        # Run the profile in dry-run mode - should complete without errors
        await run_profile(sglang_moe_args)

    # Example tests with mocked GPU inventory
    @pytest.fixture
    def mock_h100_gpu_info(self):
        """Return a fake GPU summary describing an H100 80GB cluster.

        The autogen tests install this dict as the return value of the
        patched ``get_gpu_summary``.
        """
        vram_mib = 81920  # 80GB in MiB
        inventory = dict(gpus_per_node=8, model="h100_sxm", vram=vram_mib)
        return inventory

    @pytest.fixture
    def mock_model_info(self):
        """Mock model info for DeepSeek-R1-Distill-Llama-8B.

        The autogen tests install this object as the return value of the
        patched ``get_model_info``.

        Returns:
            A non-MoE ``ModelInfo`` with a 16384 MiB model size and a
            16384-token maximum context length.
        """
        return ModelInfo(
            model_size=16384.0,  # 16GB model in MiB
            architecture="LlamaForCausalLM",
            is_moe=False,
            max_context_length=16384,
        )

    @pytest.fixture
    def vllm_args_with_model_autogen(self, request):
        """Create arguments for vllm backend with model-based search space autogeneration.

        Unlike ``vllm_args``, this fixture names a model, leaves ``config``
        empty, zeroes the GPU and context-length fields, and sets no
        ``model_info``. The consuming test calls
        ``auto_generate_search_space`` on the object before ``run_profile``.

        Args:
            request: pytest's built-in ``request`` fixture. The requesting
                test's node name is embedded in ``output_dir`` and
                ``namespace`` so tests running in parallel do not collide.

        Returns:
            A plain attribute container with ``backend="vllm"``,
            ``dry_run=True`` and ``enable_gpu_discovery=True``.
        """

        class Args:
            def __init__(self):
                self.backend = "vllm"
                self.config = ""
                # Use unique output directory per test for parallel execution
                self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
                self.namespace = f"test-namespace-{request.node.name}"
                self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"  # Specify model for autogen
                self.dgd_image = ""
                # Set to 0 to trigger auto-generation path
                self.min_num_gpus_per_engine = 0
                self.max_num_gpus_per_engine = 0
                self.skip_existing_results = False
                self.force_rerun = False
                self.isl = 3000
                self.osl = 500
                self.ttft = 50
                self.itl = 10
                self.max_context_length = 0
                self.prefill_interpolation_granularity = 16
                self.decode_interpolation_granularity = 6
                self.service_name = ""
                self.dry_run = True
                self.use_ai_configurator = False
                self.aic_system = None
                self.aic_hf_id = None
                self.aic_backend = ""
                self.aic_backend_version = None
                # Set to 0 to trigger auto-generation path
                self.num_gpus_per_node = 0
                self.deploy_after_profile = False
                self.pick_with_webui = False
                self.enable_gpu_discovery = True
                self.model_cache_pvc_name = ""
                self.model_cache_pvc_path = ""
                self.model_cache_pvc_mount_path = "/opt/model-cache"

        return Args()

    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.integration
    @pytest.mark.gpu_0
    @pytest.mark.vllm
    @patch("benchmarks.profiler.utils.search_space_autogen.get_gpu_summary")
    @patch("benchmarks.profiler.utils.search_space_autogen.get_model_info")
    async def test_profile_with_autogen_search_space_h100(
        self,
        mock_get_model_info,
        mock_get_gpu_summary,
        vllm_args_with_model_autogen,
        mock_h100_gpu_info,
        mock_model_info,
    ):
        """Test profile_sla with auto-generated search space on mocked H100 cluster.

        This test demonstrates how search space is auto-generated based on model
        size and available GPU memory.

        ``patch`` decorators are applied bottom-up, so the ``get_model_info``
        mock is the first injected argument and the ``get_gpu_summary`` mock
        the second. The remaining parameters are pytest fixtures.
        """
        # Configure the mocks to return the appropriate info
        mock_get_model_info.return_value = mock_model_info
        mock_get_gpu_summary.return_value = mock_h100_gpu_info

        # Run the profile - the search space will be auto-generated
        # based on the model and mocked GPU info
        auto_generate_search_space(vllm_args_with_model_autogen)
        await run_profile(vllm_args_with_model_autogen)

    @pytest.fixture
    def sglang_args_with_model_autogen(self, request):
        """Create arguments for sglang backend with model-based search space autogeneration.

        Unlike ``sglang_args``, this fixture names a model, leaves ``config``
        empty, zeroes the GPU and context-length fields, and sets no
        ``model_info``. The consuming test calls
        ``auto_generate_search_space`` on the object before ``run_profile``.

        Args:
            request: pytest's built-in ``request`` fixture. The requesting
                test's node name is embedded in ``output_dir`` and
                ``namespace`` so tests running in parallel do not collide.

        Returns:
            A plain attribute container with ``backend="sglang"``,
            ``dry_run=True`` and ``enable_gpu_discovery=True``.
        """

        class Args:
            def __init__(self):
                self.backend = "sglang"
                self.config = ""
                # Use unique output directory per test for parallel execution
                self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
                self.namespace = f"test-namespace-{request.node.name}"
                self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"  # Specify model for autogen
                self.dgd_image = ""
                # Zeroed like the vllm autogen fixture, which uses 0 to trigger auto-generation
                self.min_num_gpus_per_engine = 0
                self.max_num_gpus_per_engine = 0
                self.skip_existing_results = False
                self.force_rerun = False
                self.isl = 3000
                self.osl = 500
                self.ttft = 50
                self.itl = 10
                self.max_context_length = 0
                self.prefill_interpolation_granularity = 16
                self.decode_interpolation_granularity = 6
                self.service_name = ""
                self.dry_run = True
                self.use_ai_configurator = False
                self.aic_system = None
                self.aic_hf_id = None
                self.aic_backend = ""
                self.aic_backend_version = None
                self.num_gpus_per_node = 0
                self.deploy_after_profile = False
                self.pick_with_webui = False
                self.enable_gpu_discovery = True
                self.model_cache_pvc_name = ""
                self.model_cache_pvc_path = ""
                self.model_cache_pvc_mount_path = "/opt/model-cache"

        return Args()

    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.gpu_0
    @pytest.mark.integration
    @pytest.mark.sglang
    @patch("benchmarks.profiler.utils.search_space_autogen.get_gpu_summary")
    @patch("benchmarks.profiler.utils.search_space_autogen.get_model_info")
    async def test_sglang_profile_with_autogen_search_space_h100(
        self,
        mock_get_model_info,
        mock_get_gpu_summary,
        sglang_args_with_model_autogen,
        mock_h100_gpu_info,
        mock_model_info,
    ):
        """Test profile_sla with auto-generated search space for sglang on mocked H100 cluster.

        This test demonstrates how search space is auto-generated based on model
        size and available GPU memory for sglang backend.

        ``patch`` decorators are applied bottom-up, so the ``get_model_info``
        mock is the first injected argument and the ``get_gpu_summary`` mock
        the second. The remaining parameters are pytest fixtures.
        """
        # Configure the mocks to return the appropriate info
        mock_get_model_info.return_value = mock_model_info
        mock_get_gpu_summary.return_value = mock_h100_gpu_info

        # Run the profile - the search space will be auto-generated
        # based on the model and mocked GPU info
        auto_generate_search_space(sglang_args_with_model_autogen)
        await run_profile(sglang_args_with_model_autogen)

    @pytest.fixture
    def trtllm_args_with_model_autogen(self, request):
        """Create arguments for trtllm backend with model-based search space autogeneration.

        Unlike ``trtllm_args``, this fixture names a model, leaves ``config``
        empty, zeroes the GPU and context-length fields, and sets no
        ``model_info``. The consuming test calls
        ``auto_generate_search_space`` on the object before ``run_profile``.

        Args:
            request: pytest's built-in ``request`` fixture. The requesting
                test's node name is embedded in ``output_dir`` and
                ``namespace`` so tests running in parallel do not collide.

        Returns:
            A plain attribute container with ``backend="trtllm"``,
            ``dry_run=True`` and ``enable_gpu_discovery=True``.
        """

        class Args:
            def __init__(self):
                self.backend = "trtllm"
                self.config = ""
                # Use unique output directory per test for parallel execution
                self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
                self.namespace = f"test-namespace-{request.node.name}"
                self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"  # Specify model for autogen
                self.dgd_image = ""
                # Zeroed like the vllm autogen fixture, which uses 0 to trigger auto-generation
                self.min_num_gpus_per_engine = 0
                self.max_num_gpus_per_engine = 0
                self.skip_existing_results = False
                self.force_rerun = False
                self.isl = 3000
                self.osl = 500
                self.ttft = 50
                self.itl = 10
                self.max_context_length = 0
                self.prefill_interpolation_granularity = 16
                self.decode_interpolation_granularity = 6
                self.service_name = ""
                self.dry_run = True
                self.use_ai_configurator = False
                self.aic_system = None
                self.aic_hf_id = None
                self.aic_backend = ""
                self.aic_backend_version = None
                self.num_gpus_per_node = 0
                self.deploy_after_profile = False
                self.pick_with_webui = False
                self.enable_gpu_discovery = True
                self.model_cache_pvc_name = ""
                self.model_cache_pvc_path = ""
                self.model_cache_pvc_mount_path = "/opt/model-cache"

        return Args()

    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.gpu_0
    @pytest.mark.integration
    @pytest.mark.trtllm
    @patch("benchmarks.profiler.utils.search_space_autogen.get_gpu_summary")
    @patch("benchmarks.profiler.utils.search_space_autogen.get_model_info")
    async def test_trtllm_profile_with_autogen_search_space_h100(
        self,
        mock_get_model_info,
        mock_get_gpu_summary,
        trtllm_args_with_model_autogen,
        mock_h100_gpu_info,
        mock_model_info,
    ):
        """Test profile_sla with auto-generated search space for trtllm on mocked H100 cluster.

        This test demonstrates how search space is auto-generated based on model
        size and available GPU memory for trtllm backend.

        ``patch`` decorators are applied bottom-up, so the ``get_model_info``
        mock is the first injected argument and the ``get_gpu_summary`` mock
        the second. The remaining parameters are pytest fixtures.
        """
        # Configure the mocks to return the appropriate info
        mock_get_model_info.return_value = mock_model_info
        mock_get_gpu_summary.return_value = mock_h100_gpu_info

        # Run the profile - the search space will be auto-generated
        # based on the model and mocked GPU info
        auto_generate_search_space(trtllm_args_with_model_autogen)
        await run_profile(trtllm_args_with_model_autogen)