# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Test suite for profile_sla dry-run functionality.

This test ensures that the profile_sla script can successfully run in dry-run mode
for vllm, sglang, and trtllm backends with their respective disagg.yaml configurations.
"""

import sys
from pathlib import Path
from unittest.mock import patch

import pytest

# Add the project root to sys.path to enable imports
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from benchmarks.profiler.profile_sla import run_profile  # noqa: E402
from benchmarks.profiler.utils.model_info import ModelInfo  # noqa: E402
from benchmarks.profiler.utils.search_space_autogen import (  # noqa: E402
    auto_generate_search_space,
)


# Override the logger fixture from conftest.py to prevent directory creation
@pytest.fixture(autouse=True)
def logger(request):
    """Shadow the ``logger`` fixture from tests/conftest.py with a no-op.

    The conftest version creates a directory named after each test. This
    module-level override takes its place for every test in this file
    (``autouse=True``) and performs no setup or teardown, so no directories
    are created and no file handlers are added.
    """
    # `request` is accepted but not used; the fixture only has to yield once.
    yield


class TestProfileSLADryRun:
    """Dry-run smoke tests for ``run_profile`` on vllm, sglang and trtllm.

    Each test passes ``run_profile`` an argument object with ``dry_run=True``
    and succeeds when the call returns without raising; the tests make no
    assertions about the profiling output itself.
    """

    # ------------------------------------------------------------------ #
    # Argument builders shared by the fixtures below
    # ------------------------------------------------------------------ #

    @staticmethod
    def _build_args(backend, config, **overrides):
        """Return a plain attribute object holding the profiler arguments.

        Args:
            backend: Value stored in ``args.backend``.
            config: Value stored in ``args.config`` (a deployment YAML path,
                or ``""`` when the search space is auto-generated).
            **overrides: Attributes that replace a default below, or extra
                attributes appended after the defaults.

        Returns:
            An ``Args`` instance whose instance attributes are the defaults
            below with ``overrides`` applied.
        """

        class Args:
            def __init__(self, **attrs):
                for attr_name, attr_value in attrs.items():
                    setattr(self, attr_name, attr_value)

        settings = {
            "backend": backend,
            "config": config,
            "output_dir": "/tmp/test_profiling_results",
            "namespace": "test-namespace",
            "model": "",
            "dgd_image": "",
            "min_num_gpus_per_engine": 1,
            "max_num_gpus_per_engine": 8,
            "skip_existing_results": False,
            "force_rerun": False,
            "isl": 3000,
            "osl": 500,
            "ttft": 50,
            "itl": 10,
            "max_context_length": 16384,
            "prefill_interpolation_granularity": 16,
            "decode_interpolation_granularity": 6,
            "service_name": "",
            "dry_run": True,
            "use_ai_configurator": False,
            "aic_system": None,
            "aic_hf_id": None,
            "aic_backend": "",
            "aic_backend_version": None,
            "num_gpus_per_node": 8,
            "deploy_after_profile": False,
        }
        # Updating in place keeps the attribute order of the defaults; any
        # attribute that is not a default ends up last.
        settings.update(overrides)
        return Args(**settings)

    @classmethod
    def _config_args(cls, backend, config):
        """Return args for a dense-model run driven by an explicit config file."""
        args = cls._build_args(backend, config)
        # Provide minimal model_info to avoid HF queries
        args.model_info = ModelInfo(
            model_size=16384.0,
            architecture="TestArchitecture",
            is_moe=False,
            max_context_length=args.max_context_length,
        )
        return args

    @classmethod
    def _autogen_args(cls, backend):
        """Return args that name a model and leave the search space unset."""
        return cls._build_args(
            backend,
            "",
            model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",  # Specify model for autogen
            # Set to 0 to trigger auto-generation path
            min_num_gpus_per_engine=0,
            max_num_gpus_per_engine=0,
            max_context_length=0,
            # Set to 0 to trigger auto-generation path
            num_gpus_per_node=0,
            enable_gpu_discovery=True,
        )

    @staticmethod
    async def _run_autogen_dryrun(
        args, mock_get_model_info, mock_get_gpu_summary, model_info, gpu_info
    ):
        """Wire the patched lookups, auto-generate the search space, then profile.

        Args:
            args: Argument object produced by ``_autogen_args``.
            mock_get_model_info: Mock patched over
                ``search_space_autogen.get_model_info``.
            mock_get_gpu_summary: Mock patched over
                ``search_space_autogen.get_gpu_summary``.
            model_info: Value the model-info mock returns.
            gpu_info: Value the GPU-summary mock returns.
        """
        # Configure the mocks to return the appropriate info
        mock_get_model_info.return_value = model_info
        mock_get_gpu_summary.return_value = gpu_info

        # Run the profile - the search space will be auto-generated
        # based on the model and mocked GPU info
        auto_generate_search_space(args)
        await run_profile(args)

    # ------------------------------------------------------------------ #
    # Dry-run with explicit deployment configs
    # ------------------------------------------------------------------ #

    @pytest.fixture
    def vllm_args(self):
        """Create arguments for vllm backend dry-run test."""
        return self._config_args("vllm", "examples/backends/vllm/deploy/disagg.yaml")

    @pytest.fixture
    def sglang_args(self):
        """Create arguments for sglang backend dry-run test."""
        return self._config_args(
            "sglang", "examples/backends/sglang/deploy/disagg.yaml"
        )

    @pytest.mark.pre_merge
    @pytest.mark.asyncio
    async def test_vllm_dryrun(self, vllm_args):
        """Test that profile_sla dry-run works for vllm backend with disagg.yaml config."""
        # Run the profile in dry-run mode - should complete without errors
        await run_profile(vllm_args)

    @pytest.mark.pre_merge
    @pytest.mark.asyncio
    async def test_sglang_dryrun(self, sglang_args):
        """Test that profile_sla dry-run works for sglang backend with disagg.yaml config."""
        # Run the profile in dry-run mode - should complete without errors
        await run_profile(sglang_args)

    @pytest.fixture
    def trtllm_args(self):
        """Create arguments for trtllm backend dry-run test."""
        return self._config_args(
            "trtllm", "examples/backends/trtllm/deploy/disagg.yaml"
        )

    @pytest.mark.pre_merge
    @pytest.mark.asyncio
    async def test_trtllm_dryrun(self, trtllm_args):
        """Test that profile_sla dry-run works for trtllm backend with disagg.yaml config."""
        # Run the profile in dry-run mode - should complete without errors
        await run_profile(trtllm_args)

    @pytest.fixture
    def sglang_moe_args(self):
        """Create arguments for sglang backend MoE dry-run test."""
        args = self._build_args(
            "sglang",
            "recipes/deepseek-r1/sglang/disagg-16gpu/deploy.yaml",
            min_num_gpus_per_engine=8,
            max_num_gpus_per_engine=32,
        )
        args.model_info = ModelInfo(
            model_size=65536.0,
            architecture="TestMoEArchitecture",
            is_moe=True,
            max_context_length=args.max_context_length,
            num_experts=16,
        )
        return args

    @pytest.mark.pre_merge
    @pytest.mark.asyncio
    async def test_sglang_moe_dryrun(self, sglang_moe_args):
        """Test that profile_sla dry-run works for sglang backend with MoE config."""
        # Run the profile in dry-run mode - should complete without errors
        await run_profile(sglang_moe_args)

    # ------------------------------------------------------------------ #
    # Example tests with mocked GPU inventory
    # ------------------------------------------------------------------ #

    @pytest.fixture
    def mock_h100_gpu_info(self):
        """Mock GPU info for H100 80GB cluster."""
        return {
            "gpus_per_node": 8,
            "model": "h100_sxm",
            "vram": 81920,  # 80GB in MiB
        }

    @pytest.fixture
    def mock_model_info(self):
        """Mock model info for DeepSeek-R1-Distill-Llama-8B."""
        return ModelInfo(
            model_size=16384.0,  # 16GB model in MiB
            architecture="LlamaForCausalLM",
            is_moe=False,
            max_context_length=16384,
        )

    @pytest.fixture
    def vllm_args_with_model_autogen(self):
        """Create arguments for vllm backend with model-based search space autogeneration."""
        return self._autogen_args("vllm")

    @pytest.mark.pre_merge
    @pytest.mark.asyncio
    @patch("benchmarks.profiler.utils.search_space_autogen.get_gpu_summary")
    @patch("benchmarks.profiler.utils.search_space_autogen.get_model_info")
    async def test_profile_with_autogen_search_space_h100(
        self,
        mock_get_model_info,
        mock_get_gpu_summary,
        vllm_args_with_model_autogen,
        mock_h100_gpu_info,
        mock_model_info,
    ):
        """Test profile_sla with auto-generated search space on mocked H100 cluster.

        This test demonstrates how search space is auto-generated based on model
        size and available GPU memory.
        """
        await self._run_autogen_dryrun(
            vllm_args_with_model_autogen,
            mock_get_model_info,
            mock_get_gpu_summary,
            mock_model_info,
            mock_h100_gpu_info,
        )

    @pytest.fixture
    def sglang_args_with_model_autogen(self):
        """Create arguments for sglang backend with model-based search space autogeneration."""
        return self._autogen_args("sglang")

    @pytest.mark.pre_merge
    @pytest.mark.asyncio
    @patch("benchmarks.profiler.utils.search_space_autogen.get_gpu_summary")
    @patch("benchmarks.profiler.utils.search_space_autogen.get_model_info")
    async def test_sglang_profile_with_autogen_search_space_h100(
        self,
        mock_get_model_info,
        mock_get_gpu_summary,
        sglang_args_with_model_autogen,
        mock_h100_gpu_info,
        mock_model_info,
    ):
        """Test profile_sla with auto-generated search space for sglang on mocked H100 cluster.

        This test demonstrates how search space is auto-generated based on model
        size and available GPU memory for sglang backend.
        """
        await self._run_autogen_dryrun(
            sglang_args_with_model_autogen,
            mock_get_model_info,
            mock_get_gpu_summary,
            mock_model_info,
            mock_h100_gpu_info,
        )

    @pytest.fixture
    def trtllm_args_with_model_autogen(self):
        """Create arguments for trtllm backend with model-based search space autogeneration."""
        return self._autogen_args("trtllm")

    @pytest.mark.pre_merge
    @pytest.mark.asyncio
    @patch("benchmarks.profiler.utils.search_space_autogen.get_gpu_summary")
    @patch("benchmarks.profiler.utils.search_space_autogen.get_model_info")
    async def test_trtllm_profile_with_autogen_search_space_h100(
        self,
        mock_get_model_info,
        mock_get_gpu_summary,
        trtllm_args_with_model_autogen,
        mock_h100_gpu_info,
        mock_model_info,
    ):
        """Test profile_sla with auto-generated search space for trtllm on mocked H100 cluster.

        This test demonstrates how search space is auto-generated based on model
        size and available GPU memory for trtllm backend.
        """
        await self._run_autogen_dryrun(
            trtllm_args_with_model_autogen,
            mock_get_model_info,
            mock_get_gpu_summary,
            mock_model_info,
            mock_h100_gpu_info,
        )