test_profile_sla_dryrun.py 16.2 KB
Newer Older
1
2
3
4
5
6
7
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Test suite for profile_sla dry-run functionality.

This test ensures that the profile_sla script can successfully run in dry-run mode
for vllm, sglang, and trtllm backends with their respective disagg.yaml configurations.
"""

import sys
from pathlib import Path
13
from unittest.mock import patch
14
15
16
17
18
19
20
21

import pytest

# Add the project root to sys.path to enable imports
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from benchmarks.profiler.profile_sla import run_profile  # noqa: E402
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from benchmarks.profiler.utils.search_space_autogen import (  # noqa: E402
    auto_generate_search_space,
)


# Shadow the conftest.py logger fixture so that no per-test directories are created
@pytest.fixture(autouse=True)
def logger(request):
    """No-op stand-in for the ``logger`` fixture from tests/conftest.py.

    Per the original note in this file, the conftest version creates a
    directory named after each test. This override performs no setup and no
    teardown, so nothing is written to disk on behalf of these tests.
    """
    # Hand control straight to the test: no directories, no file handlers.
    yield None
37
38
39
40
41
42
43
44
45
46


class TestProfileSLADryRun:
    """Dry-run smoke tests for ``run_profile`` across backends.

    Every test builds an argument object (a plain attribute bag standing in
    for the parsed CLI arguments), sets ``dry_run = True`` and awaits
    ``run_profile``. The tests contain no assertions: passing means the call
    completed without raising.

    All scenarios share one set of baseline argument values, defined once in
    ``_make_args``; each fixture only states what differs from that baseline.
    """

    # Model used by the search-space autogeneration scenarios.
    _AUTOGEN_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

    @staticmethod
    def _make_args(backend, config, **overrides):
        """Build an argument object for ``run_profile``.

        Args:
            backend: Value stored in ``args.backend`` ("vllm", "sglang", "trtllm").
            config: Value stored in ``args.config`` (path to a deployment YAML,
                or "" when the scenario does not supply one).
            **overrides: Attribute values that replace the shared baseline.

        Returns:
            An ``Args`` instance carrying the baseline attributes with
            ``overrides`` applied on top.

        Raises:
            AttributeError: If an override names an attribute that is not part
                of the baseline, which catches typos in fixture definitions.
        """

        class Args:
            def __init__(self):
                self.backend = backend
                self.config = config
                self.output_dir = "/tmp/test_profiling_results"
                self.namespace = "test-namespace"
                self.model = ""
                self.dgd_image = ""
                self.min_num_gpus_per_engine = 1
                self.max_num_gpus_per_engine = 8
                self.skip_existing_results = False
                self.force_rerun = False
                self.isl = 3000
                self.osl = 500
                self.ttft = 50
                self.itl = 10
                self.max_context_length = 16384
                self.prefill_interpolation_granularity = 16
                self.decode_interpolation_granularity = 6
                self.service_name = ""
                self.is_moe_model = False
                self.dry_run = True
                self.use_ai_configurator = False
                self.aic_system = None
                self.aic_model_name = None
                self.aic_backend = ""
                self.aic_backend_version = None
                self.num_gpus_per_node = 8
                self.deploy_after_profile = False

        args = Args()
        for name, value in overrides.items():
            # Reject unknown names so a misspelled override cannot silently
            # create a new attribute that nothing reads.
            if not hasattr(args, name):
                raise AttributeError(f"Unknown profile_sla test argument: {name!r}")
            setattr(args, name, value)
        return args

    @classmethod
    def _make_autogen_args(cls, backend):
        """Build arguments for a model-based search-space autogeneration run.

        No config file is given; the model name is set instead. The GPU-count
        and context-length fields hold placeholders (0 / None) that the tests
        expect ``auto_generate_search_space`` to fill in before profiling.
        """
        return cls._make_args(
            backend,
            "",
            model=cls._AUTOGEN_MODEL,  # Specify model for autogen
            min_num_gpus_per_engine=0,  # Will be auto-generated
            max_num_gpus_per_engine=0,  # Will be auto-generated
            max_context_length=0,
            num_gpus_per_node=None,  # Will be auto-generated
        )

    @pytest.fixture
    def vllm_args(self):
        """Create arguments for vllm backend dry-run test."""
        return self._make_args(
            "vllm", "components/backends/vllm/deploy/disagg.yaml"
        )

    @pytest.fixture
    def sglang_args(self):
        """Create arguments for sglang backend dry-run test."""
        return self._make_args(
            "sglang", "components/backends/sglang/deploy/disagg.yaml"
        )

    @pytest.mark.pre_merge
    @pytest.mark.asyncio
    async def test_vllm_dryrun(self, vllm_args):
        """Test that profile_sla dry-run works for vllm backend with disagg.yaml config."""
        # Run the profile in dry-run mode - should complete without errors
        await run_profile(vllm_args)

    @pytest.mark.pre_merge
    @pytest.mark.asyncio
    async def test_sglang_dryrun(self, sglang_args):
        """Test that profile_sla dry-run works for sglang backend with disagg.yaml config."""
        # Run the profile in dry-run mode - should complete without errors
        await run_profile(sglang_args)

    @pytest.fixture
    def trtllm_args(self):
        """Create arguments for trtllm backend dry-run test."""
        return self._make_args(
            "trtllm", "components/backends/trtllm/deploy/disagg.yaml"
        )

    @pytest.mark.pre_merge
    @pytest.mark.asyncio
    async def test_trtllm_dryrun(self, trtllm_args):
        """Test that profile_sla dry-run works for trtllm backend with disagg.yaml config."""
        # Run the profile in dry-run mode - should complete without errors
        await run_profile(trtllm_args)

    @pytest.fixture
    def sglang_moe_args(self):
        """Create arguments for sglang backend MoE dry-run test.

        Differs from the dense sglang scenario in the recipe config, the
        per-engine GPU range (8 to 32) and ``is_moe_model = True``.
        """
        return self._make_args(
            "sglang",
            "recipes/deepseek-r1/sglang/disagg-16gpu/deploy.yaml",
            min_num_gpus_per_engine=8,
            max_num_gpus_per_engine=32,
            is_moe_model=True,
        )

    @pytest.mark.pre_merge
    @pytest.mark.asyncio
    async def test_sglang_moe_dryrun(self, sglang_moe_args):
        """Test that profile_sla dry-run works for sglang backend with MoE config."""
        # Run the profile in dry-run mode - should complete without errors
        await run_profile(sglang_moe_args)

    # Example tests with mocked GPU inventory
    @pytest.fixture
    def mock_h100_gpu_info(self):
        """Mock GPU info for H100 80GB cluster."""
        return {
            "gpus_per_node": 8,
            "model": "h100_sxm",
            "vram": 81920,  # 80GB in MiB
        }

    @pytest.fixture
    def mock_model_info(self):
        """Mock model info for DeepSeek-R1-Distill-Llama-8B."""
        return {
            "model_size": 16384,  # 16GB model in MiB
            "is_moe": False,
            "max_context_length": 16384,  # 16K tokens
        }

    @pytest.fixture
    def vllm_args_with_model_autogen(self):
        """Create arguments for vllm backend with model-based search space autogeneration."""
        return self._make_autogen_args("vllm")

    @pytest.mark.pre_merge
    @pytest.mark.asyncio
    @patch("benchmarks.profiler.utils.search_space_autogen.get_gpu_summary")
    @patch("benchmarks.profiler.utils.search_space_autogen.get_model_info")
    async def test_profile_with_autogen_search_space_h100(
        self,
        mock_get_model_info,
        mock_get_gpu_summary,
        vllm_args_with_model_autogen,
        mock_h100_gpu_info,
        mock_model_info,
    ):
        """Test profile_sla with auto-generated search space on mocked H100 cluster.

        This test demonstrates how search space is auto-generated based on model
        size and available GPU memory.
        """
        # Configure the mocks to return the appropriate info
        mock_get_model_info.return_value = mock_model_info
        mock_get_gpu_summary.return_value = mock_h100_gpu_info

        # Run the profile - the search space will be auto-generated
        # based on the model and mocked GPU info
        auto_generate_search_space(vllm_args_with_model_autogen)
        await run_profile(vllm_args_with_model_autogen)

    @pytest.fixture
    def sglang_args_with_model_autogen(self):
        """Create arguments for sglang backend with model-based search space autogeneration."""
        return self._make_autogen_args("sglang")

    @pytest.mark.pre_merge
    @pytest.mark.asyncio
    @patch("benchmarks.profiler.utils.search_space_autogen.get_gpu_summary")
    @patch("benchmarks.profiler.utils.search_space_autogen.get_model_info")
    async def test_sglang_profile_with_autogen_search_space_h100(
        self,
        mock_get_model_info,
        mock_get_gpu_summary,
        sglang_args_with_model_autogen,
        mock_h100_gpu_info,
        mock_model_info,
    ):
        """Test profile_sla with auto-generated search space for sglang on mocked H100 cluster.

        This test demonstrates how search space is auto-generated based on model
        size and available GPU memory for sglang backend.
        """
        # Configure the mocks to return the appropriate info
        mock_get_model_info.return_value = mock_model_info
        mock_get_gpu_summary.return_value = mock_h100_gpu_info

        # Run the profile - the search space will be auto-generated
        # based on the model and mocked GPU info
        auto_generate_search_space(sglang_args_with_model_autogen)
        await run_profile(sglang_args_with_model_autogen)

    @pytest.fixture
    def trtllm_args_with_model_autogen(self):
        """Create arguments for trtllm backend with model-based search space autogeneration."""
        return self._make_autogen_args("trtllm")

    @pytest.mark.pre_merge
    @pytest.mark.asyncio
    @patch("benchmarks.profiler.utils.search_space_autogen.get_gpu_summary")
    @patch("benchmarks.profiler.utils.search_space_autogen.get_model_info")
    async def test_trtllm_profile_with_autogen_search_space_h100(
        self,
        mock_get_model_info,
        mock_get_gpu_summary,
        trtllm_args_with_model_autogen,
        mock_h100_gpu_info,
        mock_model_info,
    ):
        """Test profile_sla with auto-generated search space for trtllm on mocked H100 cluster.

        This test demonstrates how search space is auto-generated based on model
        size and available GPU memory for trtllm backend.
        """
        # Configure the mocks to return the appropriate info
        mock_get_model_info.return_value = mock_model_info
        mock_get_gpu_summary.return_value = mock_h100_gpu_info

        # Run the profile - the search space will be auto-generated
        # based on the model and mocked GPU info
        auto_generate_search_space(trtllm_args_with_model_autogen)
        await run_profile(trtllm_args_with_model_autogen)