Unverified commit 4c648b11, authored by Hongkuan Zhou and committed by GitHub
Browse files

refactor: move core logics of DPP -> AIC and support static profiling (#6285)


Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
Co-authored-by: hhzhang16 <54051230+hhzhang16@users.noreply.github.com>
parent f6d4351f
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Case 9a: Thorough sweep for DeepSeek-R1 on SGLang with DGD overrides.
# Overrides are derived by comparing the base sglang disagg template
# (examples/backends/sglang/deploy/disagg.yaml) with the DSR1 recipe
# (recipes/deepseek-r1/sglang/disagg-8gpu/deploy.yaml), excluding
# parallelization-mapping fields (--tp, --dp, --ep-size,
# --enable-dp-attention) which are swept by the profiler.
# Model, backend and runtime image for this sweep case.
model: "deepseek-ai/DeepSeek-R1"
backend: sglang
image: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.8.0"
# GPU budget the sweep may use.
hardware:
  gpuSku: h200_sxm
  totalGpus: 16
  numGpusPerNode: 8
# PVC name and mount path for the model cache.
modelCache:
  pvcName: model-cache
  pvcMountPath: /opt/model
# Input / output sequence lengths of the target workload.
workload:
  isl: 4000
  osl: 1000
# Latency targets for the SLA.
sla:
  ttft: 2000.0
  itl: 50.0
searchStrategy: thorough
# DGD overrides for the decode and prefill services. Parallelization-mapping
# flags (--tp, --dp, --ep-size, --enable-dp-attention) are intentionally
# absent because the profiler sweeps them (see the file header).
overrides:
  dgd:
    spec:
      services:
        decode:
          sharedMemory:
            size: 80Gi
          extraPodSpec:
            mainContainer:
              workingDir: /workspace
              args:
                - --mem-fraction-static
                - "0.75"
                - --prefill-round-robin-balance
                - --watchdog-timeout
                - "3600"
        prefill:
          sharedMemory:
            size: 80Gi
          extraPodSpec:
            mainContainer:
              workingDir: /workspace
              args:
                - --mem-fraction-static
                - "0.75"
                - --load-balance-method
                - round_robin
                - --watchdog-timeout
                - "3600"
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Unit tests for profile_sla.py private helper functions.
These tests exercise each helper in isolation, without running the full
profiling pipeline. External I/O (DGD generation, deployment) is mocked
where needed.
"""
import os
import sys
from pathlib import Path
from unittest.mock import patch
import pytest
import yaml
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
try:
from dynamo.planner.utils.planner_config import (
PlannerConfig,
PlannerPreDeploymentSweepMode,
)
from dynamo.profiler.profile_sla import (
_assemble_final_config,
_extract_profiler_params,
_write_final_output,
)
from dynamo.profiler.utils.config_modifiers.parallelization_mapping import (
PickedParallelConfig,
)
from dynamo.profiler.utils.defaults import SearchStrategy
from dynamo.profiler.utils.dgdr_v1beta1_types import (
DynamoGraphDeploymentRequestSpec,
FeaturesSpec,
HardwareSpec,
MockerSpec,
SLASpec,
WorkloadSpec,
)
from dynamo.profiler.utils.dgdr_validate import run_gate_checks
from dynamo.profiler.utils.profile_common import ProfilerOperationalConfig
except ImportError as e:
pytest.skip(f"Skip (missing dependency): {e}", allow_module_level=True)
# ---------------------------------------------------------------------------
# Shared fixtures
# ---------------------------------------------------------------------------
def _make_dgdr(**overrides) -> DynamoGraphDeploymentRequestSpec:
    """Return a minimal DGDR spec; keyword arguments replace the default fields."""
    fields = {
        "model": "Qwen/Qwen3-32B",
        "backend": "trtllm",
        "image": "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:latest",
        "hardware": HardwareSpec(gpuSku="h200_sxm", totalGpus=8, numGpusPerNode=8),
        "workload": WorkloadSpec(isl=4000, osl=1000),
        "sla": SLASpec(ttft=2000.0, itl=50.0),
        # Caller-supplied values win over the defaults above.
        **overrides,
    }
    return DynamoGraphDeploymentRequestSpec(**fields)
def _make_planner(**overrides) -> PlannerConfig:
    """Return a planner config with test defaults; keyword arguments replace them."""
    settings = {
        "enable_throughput_scaling": True,
        "enable_load_scaling": False,
        "pre_deployment_sweeping_mode": PlannerPreDeploymentSweepMode.Rapid,
        "mode": "disagg",
        "backend": "trtllm",
        # Caller-supplied values win over the defaults above.
        **overrides,
    }
    return PlannerConfig(**settings)
def _make_ops(tmp_path, **kwargs) -> ProfilerOperationalConfig:
    """Build an operational config whose ``output_dir`` is ``<tmp_path>/out``.

    Extra keyword arguments are forwarded to ``ProfilerOperationalConfig``.
    """
    out_dir = tmp_path / "out"
    return ProfilerOperationalConfig(output_dir=str(out_dir), **kwargs)
# ---------------------------------------------------------------------------
# _extract_profiler_params
# ---------------------------------------------------------------------------
class TestExtractProfilerParams:
    """Checks on the 11-field tuple returned by ``_extract_profiler_params``."""

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_basic_ttft_itl(self):
        """Returns correct values when ttft/itl SLA is used."""
        spec = _make_dgdr()
        (
            model_name,
            backend_name,
            gpu_system,
            gpu_count,
            input_len,
            output_len,
            request_latency,
            ttft_target,
            tpot_target,
            search_strategy,
            picking_mode,
        ) = _extract_profiler_params(spec)

        # Deployment target.
        assert model_name == "Qwen/Qwen3-32B"
        assert backend_name == "trtllm"
        assert gpu_system == "h200_sxm"
        assert gpu_count == 8
        # Workload shape.
        assert input_len == 4000
        assert output_len == 1000
        # Latency targets: no end-to-end latency was configured.
        assert request_latency is None
        assert ttft_target == 2000.0
        assert tpot_target == 50.0
        # Search settings.
        assert search_strategy == SearchStrategy.RAPID
        assert picking_mode == "default"

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_e2e_latency_sets_both_targets(self):
        """Both ttft and tpot equal e2eLatency when it is set."""
        spec = _make_dgdr(sla=SLASpec(ttft=None, itl=None, e2eLatency=35000.0))
        extracted = _extract_profiler_params(spec)
        (
            _,
            _,
            _,
            _,
            _,
            _,
            request_latency,
            ttft_target,
            tpot_target,
            _,
            _,
        ) = extracted
        assert request_latency == 35000.0
        assert ttft_target == 35000.0
        assert tpot_target == 35000.0

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_request_rate_yields_load_match_picking(self):
        """requestRate present in workload → picking_mode == 'load_match'."""
        workload = WorkloadSpec(isl=4000, osl=1000, requestRate=5.0)
        spec = _make_dgdr(workload=workload)
        extracted = _extract_profiler_params(spec)
        _, _, _, _, _, _, _, _, _, _, picking_mode = extracted
        assert picking_mode == "load_match"

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_backend_lowercased(self):
        """backend value is always lower-cased."""
        # NOTE(review): the input is already lower-case, so this does not
        # exercise an actual case conversion — consider a mixed-case input
        # if the schema accepts one.
        spec = _make_dgdr(backend="trtllm")
        extracted = _extract_profiler_params(spec)
        _, backend_name, _, _, _, _, _, _, _, _, _ = extracted
        assert backend_name == "trtllm"
        assert backend_name == backend_name.lower()

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_thorough_strategy_extracted(self):
        """searchStrategy: thorough is correctly reflected in the returned tuple."""
        spec = _make_dgdr(searchStrategy="thorough")
        extracted = _extract_profiler_params(spec)
        _, _, _, _, _, _, _, _, _, search_strategy, _ = extracted
        assert search_strategy == SearchStrategy.THOROUGH
# ---------------------------------------------------------------------------
# run_gate_checks
# ---------------------------------------------------------------------------
class TestRunGateChecks:
    """Gate-check behaviour across search strategy, backend and AIC support."""

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_thorough_auto_backend_raises(self):
        """THOROUGH + 'auto' backend is rejected."""
        spec = _make_dgdr()
        rejects_auto = pytest.raises(
            ValueError, match="does not support 'auto' backend"
        )
        with rejects_auto:
            run_gate_checks(
                spec,
                aic_supported=True,
                search_strategy=SearchStrategy.THOROUGH,
                backend="auto",
            )

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_thorough_concrete_backend_passes(self):
        """THOROUGH + concrete backend is fine."""
        spec = _make_dgdr()
        # Passing means the call simply does not raise.
        run_gate_checks(
            spec,
            aic_supported=True,
            search_strategy=SearchStrategy.THOROUGH,
            backend="trtllm",
        )

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_rapid_auto_backend_passes(self):
        """RAPID allows 'auto' backend."""
        spec = _make_dgdr()
        run_gate_checks(
            spec,
            aic_supported=False,
            search_strategy=SearchStrategy.RAPID,
            backend="auto",
        )

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_no_planner_aic_unsupported_passes(self):
        """No planner, AIC unsupported — no error."""
        spec = _make_dgdr()
        run_gate_checks(
            spec,
            aic_supported=False,
            search_strategy=SearchStrategy.RAPID,
            backend="vllm",
        )

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_planner_throughput_scaling_aic_unsupported_raises(self):
        """Throughput-based planner scaling requires AIC support."""
        planner_cfg = _make_planner(enable_throughput_scaling=True, backend="vllm")
        spec = _make_dgdr(features=FeaturesSpec(planner=planner_cfg))
        needs_aic = pytest.raises(
            ValueError, match="Throughput-based planner scaling requires AIC support"
        )
        with needs_aic:
            run_gate_checks(
                spec,
                aic_supported=False,
                search_strategy=SearchStrategy.RAPID,
                backend="vllm",
            )

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_planner_rapid_sweep_aic_unsupported_mutates_to_none(self):
        """Rapid pre-deployment sweep falls back to None when AIC is unsupported."""
        planner_cfg = _make_planner(
            enable_throughput_scaling=False,
            enable_load_scaling=True,
            pre_deployment_sweeping_mode=PlannerPreDeploymentSweepMode.Rapid,
            backend="vllm",
        )
        spec = _make_dgdr(features=FeaturesSpec(planner=planner_cfg))
        run_gate_checks(
            spec,
            aic_supported=False,
            search_strategy=SearchStrategy.RAPID,
            backend="vllm",
        )
        # Read the mode back through the spec: the check mutates it in place.
        sweep_mode = spec.features.planner.pre_deployment_sweeping_mode
        assert sweep_mode == PlannerPreDeploymentSweepMode.None_

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_planner_aic_supported_no_mutation(self):
        """When AIC is supported, planner config is left unchanged."""
        planner_cfg = _make_planner(
            pre_deployment_sweeping_mode=PlannerPreDeploymentSweepMode.Rapid,
        )
        spec = _make_dgdr(features=FeaturesSpec(planner=planner_cfg))
        run_gate_checks(
            spec,
            aic_supported=True,
            search_strategy=SearchStrategy.RAPID,
            backend="trtllm",
        )
        sweep_mode = spec.features.planner.pre_deployment_sweeping_mode
        assert sweep_mode == PlannerPreDeploymentSweepMode.Rapid
# ---------------------------------------------------------------------------
# _write_final_output
# ---------------------------------------------------------------------------
class TestWriteFinalOutput:
    """Return value and on-disk result of ``_write_final_output``."""

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_normal_config_writes_file_and_returns_true(self, tmp_path):
        """A dict config is written to final_config.yaml and round-trips."""
        ops = _make_ops(tmp_path)
        os.makedirs(ops.output_dir, exist_ok=True)
        expected = {"apiVersion": "v1", "kind": "Deployment"}

        outcome = _write_final_output(ops, expected)

        assert outcome is True
        written = Path(ops.output_dir) / "final_config.yaml"
        assert written.exists()
        loaded = yaml.safe_load(written.read_text())
        assert loaded == expected

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_list_config_writes_multi_doc_yaml(self, tmp_path):
        """A list config yields one YAML document per list entry."""
        ops = _make_ops(tmp_path)
        os.makedirs(ops.output_dir, exist_ok=True)
        config_list = [{"kind": "A"}, {"kind": "B"}]

        outcome = _write_final_output(ops, config_list)

        assert outcome is True
        written = Path(ops.output_dir) / "final_config.yaml"
        documents = list(yaml.safe_load_all(written.read_text()))
        assert len(documents) == 2

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_none_config_not_dry_run_returns_false(self, tmp_path):
        """A None config outside dry-run mode returns False."""
        ops = _make_ops(tmp_path, dry_run=False)
        os.makedirs(ops.output_dir, exist_ok=True)

        outcome = _write_final_output(ops, None)

        assert outcome is False

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_none_config_dry_run_writes_empty_yaml_and_returns_true(self, tmp_path):
        """In dry-run mode a None config still produces an (empty) YAML file."""
        ops = _make_ops(tmp_path, dry_run=True)
        os.makedirs(ops.output_dir, exist_ok=True)

        outcome = _write_final_output(ops, None)

        assert outcome is True
        written = Path(ops.output_dir) / "final_config.yaml"
        assert written.exists()
        loaded = yaml.safe_load(written.read_text())
        assert loaded is None  # empty YAML == None
# ---------------------------------------------------------------------------
# _assemble_final_config
# ---------------------------------------------------------------------------
class TestAssembleFinalConfig:
    """Which config ``_assemble_final_config`` hands back per feature combination."""

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_no_planner_no_mocker_returns_dgd_config_unchanged(self, tmp_path):
        """With no features set, the very same dgd_config object comes back."""
        spec = _make_dgdr()
        ops = _make_ops(tmp_path)
        base_config = {"kind": "DynamoGraphDeployment"}

        assembled = _assemble_final_config(
            spec,
            ops,
            base_config,
            PickedParallelConfig(tp=1),
            PickedParallelConfig(tp=1),
        )

        assert assembled is base_config

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_none_dgd_config_passes_through_as_none(self, tmp_path):
        """A None dgd_config is returned as None."""
        spec = _make_dgdr()
        ops = _make_ops(tmp_path)

        assembled = _assemble_final_config(
            spec,
            ops,
            None,
            PickedParallelConfig(tp=1),
            PickedParallelConfig(tp=1),
        )

        assert assembled is None

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_planner_no_mocker_returns_real_config(self, tmp_path):
        """Planner enabled, mocker absent: the first generated config is returned."""
        spec = _make_dgdr(features=FeaturesSpec(planner=_make_planner()))
        ops = _make_ops(tmp_path)
        os.makedirs(ops.output_dir, exist_ok=True)
        base_config = {"kind": "DGD"}
        real_cfg = {"kind": "real"}
        mocker_cfg = {"kind": "mocker"}

        # Stub the planner-aware generator so no real DGD generation runs.
        planner_patch = patch(
            "dynamo.profiler.profile_sla.generate_dgd_config_with_planner",
            return_value=(real_cfg, mocker_cfg),
        )
        with planner_patch:
            assembled = _assemble_final_config(
                spec,
                ops,
                base_config,
                PickedParallelConfig(tp=1),
                PickedParallelConfig(tp=1),
            )

        assert assembled is real_cfg

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_mocker_enabled_returns_mocker_config(self, tmp_path):
        """Planner plus enabled mocker: the second generated config is returned."""
        features = FeaturesSpec(
            planner=_make_planner(),
            mocker=MockerSpec(enabled=True),
        )
        spec = _make_dgdr(features=features)
        ops = _make_ops(tmp_path)
        os.makedirs(ops.output_dir, exist_ok=True)
        base_config = {"kind": "DGD"}
        real_cfg = {"kind": "real"}
        mocker_cfg = {"kind": "mocker"}

        planner_patch = patch(
            "dynamo.profiler.profile_sla.generate_dgd_config_with_planner",
            return_value=(real_cfg, mocker_cfg),
        )
        with planner_patch:
            assembled = _assemble_final_config(
                spec,
                ops,
                base_config,
                PickedParallelConfig(tp=1),
                PickedParallelConfig(tp=1),
            )

        assert assembled is mocker_cfg
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Unit tests for rapid.py private helper functions.
Tests _run_naive_fallback and _run_default_sim in isolation; AIC simulation
helpers (_run_autoscale_sim) require the full AIC stack and are covered by
the end-to-end test suite.
"""
import sys
from pathlib import Path
from unittest.mock import MagicMock, patch
import pandas as pd
import pytest
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
try:
from dynamo.profiler.rapid import _run_default_sim, _run_naive_fallback
from dynamo.profiler.utils.dgdr_v1beta1_types import (
DynamoGraphDeploymentRequestSpec,
HardwareSpec,
ModelCacheSpec,
SLASpec,
WorkloadSpec,
)
except ImportError as e:
pytest.skip(f"Skip (missing dependency): {e}", allow_module_level=True)
# ---------------------------------------------------------------------------
# Shared fixtures
# ---------------------------------------------------------------------------
def _make_dgdr(**overrides) -> DynamoGraphDeploymentRequestSpec:
    """Return a minimal vllm/l40s DGDR spec; keyword arguments replace defaults."""
    fields = {
        "model": "Qwen/Qwen3-32B",
        "backend": "vllm",
        "image": "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:latest",
        "hardware": HardwareSpec(gpuSku="l40s", totalGpus=4, numGpusPerNode=4),
        "workload": WorkloadSpec(isl=4000, osl=1000),
        "sla": SLASpec(ttft=2000.0, itl=50.0),
        # Caller-supplied values win over the defaults above.
        **overrides,
    }
    return DynamoGraphDeploymentRequestSpec(**fields)
def _fake_modifier(update_image_return=None):
m = MagicMock()
m.update_image.return_value = update_image_return or {"kind": "DGD"}
m.update_model_from_pvc.return_value = {"kind": "DGD"}
return m
# ---------------------------------------------------------------------------
# _run_naive_fallback
# ---------------------------------------------------------------------------
class TestRunNaiveFallback:
    """Result shape and PVC handling of ``_run_naive_fallback``."""

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_returns_expected_structure(self):
        """Result always has the four required keys with zeroed latencies."""
        spec = _make_dgdr()
        naive_patch = patch(
            "dynamo.profiler.rapid.generate_naive_config",
            return_value={"artifacts": {}},
        )
        with naive_patch:
            outcome = _run_naive_fallback(spec, "Qwen/Qwen3-32B", 4, "l40s", "vllm")

        required_keys = {
            "best_config_df",
            "best_latencies",
            "dgd_config",
            "chosen_exp",
        }
        assert set(outcome) >= required_keys
        zeroed = {
            "ttft": 0.0,
            "tpot": 0.0,
            "request_latency": 0.0,
        }
        assert outcome["best_latencies"] == zeroed
        assert outcome["chosen_exp"] is None
        config_df = outcome["best_config_df"]
        assert isinstance(config_df, pd.DataFrame)
        assert config_df.empty

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_empty_artifacts_yields_none_dgd_config(self):
        """No k8s_deploy.yaml in artifacts → dgd_config is None."""
        spec = _make_dgdr()
        naive_patch = patch(
            "dynamo.profiler.rapid.generate_naive_config",
            return_value={"artifacts": {}},
        )
        with naive_patch:
            outcome = _run_naive_fallback(spec, "Qwen/Qwen3-32B", 4, "l40s", "vllm")
        assert outcome["dgd_config"] is None

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_with_pvc_calls_update_model_from_pvc(self):
        """When modelCache.pvcName is set, update_model_from_pvc is called."""
        cache = ModelCacheSpec(
            pvcName="model-cache",
            pvcModelPath="/model/qwen",
            pvcMountPath="/opt/model-cache",
        )
        spec = _make_dgdr(modelCache=cache)
        modifier = _fake_modifier()
        with (
            patch(
                "dynamo.profiler.rapid.generate_naive_config",
                return_value={"artifacts": {"k8s_deploy.yaml": "kind: DGD"}},
            ),
            patch("dynamo.profiler.rapid.CONFIG_MODIFIERS", {"vllm": modifier}),
        ):
            _run_naive_fallback(spec, "Qwen/Qwen3-32B", 4, "l40s", "vllm")
        modifier.update_model_from_pvc.assert_called_once()

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_without_pvc_skips_update_model_from_pvc(self):
        """When no modelCache, update_model_from_pvc is not called."""
        spec = _make_dgdr()
        modifier = _fake_modifier()
        with (
            patch(
                "dynamo.profiler.rapid.generate_naive_config",
                return_value={"artifacts": {"k8s_deploy.yaml": "kind: DGD"}},
            ),
            patch("dynamo.profiler.rapid.CONFIG_MODIFIERS", {"vllm": modifier}),
        ):
            _run_naive_fallback(spec, "Qwen/Qwen3-32B", 4, "l40s", "vllm")
        modifier.update_model_from_pvc.assert_not_called()
# ---------------------------------------------------------------------------
# _run_default_sim
# ---------------------------------------------------------------------------
class TestRunDefaultSim:
    """Result shape and kwarg forwarding of ``_run_default_sim``."""

    def _execute_return(self, chosen="disagg", ttft=100.0, tpot=10.0):
        """Build a fake _execute_task_configs return value."""
        frame = pd.DataFrame([{"tp(p)": 1}])
        latency_entry = {"ttft": ttft, "tpot": tpot, "request_latency": 0.0}
        return chosen, {chosen: frame}, None, None, {chosen: latency_entry}

    def _simulate(self, dgdr, picking_mode):
        """Call ``_run_default_sim`` with the fixed arguments every test shares."""
        return _run_default_sim(
            dgdr,
            "Qwen/Qwen3-32B",
            "h200_sxm",
            "trtllm",
            8,
            4000,
            1000,
            2000.0,
            50.0,
            None,
            picking_mode,
        )

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_returns_required_keys(self):
        """The result carries every expected key and the chosen experiment."""
        spec = _make_dgdr()
        with (
            patch("dynamo.profiler.rapid.build_default_task_configs", return_value={}),
            patch(
                "dynamo.profiler.rapid._execute_task_configs",
                return_value=self._execute_return(),
            ),
            patch(
                "dynamo.profiler.rapid._generate_dgd_from_pick",
                return_value={"kind": "DGD"},
            ),
        ):
            outcome = self._simulate(spec, "default")

        required_keys = {
            "best_config_df",
            "best_latencies",
            "dgd_config",
            "chosen_exp",
            "task_configs",
        }
        assert set(outcome) >= required_keys
        assert outcome["chosen_exp"] == "disagg"

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_load_match_passes_load_kwargs(self):
        """load_match picking mode forwards rate/concurrency/max_gpus to execute."""
        workload = WorkloadSpec(isl=4000, osl=1000, requestRate=5.0)
        spec = _make_dgdr(workload=workload)
        seen_kwargs: dict = {}

        def fake_execute(task_configs, mode, top_n, **kwargs):
            # Record only the keyword extras; positional arguments are ignored.
            seen_kwargs.update(kwargs)
            return self._execute_return()

        with (
            patch("dynamo.profiler.rapid.build_default_task_configs", return_value={}),
            patch(
                "dynamo.profiler.rapid._execute_task_configs", side_effect=fake_execute
            ),
            patch("dynamo.profiler.rapid._generate_dgd_from_pick", return_value=None),
        ):
            self._simulate(spec, "load_match")

        assert "target_request_rate" in seen_kwargs
        assert seen_kwargs["target_request_rate"] == 5.0
        assert seen_kwargs["max_total_gpus"] == 8

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_default_mode_passes_no_load_kwargs(self):
        """default picking mode does not forward load-match kwargs."""
        spec = _make_dgdr()
        seen_kwargs: dict = {}

        def fake_execute(task_configs, mode, top_n, **kwargs):
            seen_kwargs.update(kwargs)
            return self._execute_return()

        with (
            patch("dynamo.profiler.rapid.build_default_task_configs", return_value={}),
            patch(
                "dynamo.profiler.rapid._execute_task_configs", side_effect=fake_execute
            ),
            patch("dynamo.profiler.rapid._generate_dgd_from_pick", return_value=None),
        ):
            self._simulate(spec, "default")

        assert "target_request_rate" not in seen_kwargs
        assert "max_total_gpus" not in seen_kwargs

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_latencies_extracted_from_chosen_exp(self):
        """best_latencies come from the chosen experiment's entry."""
        spec = _make_dgdr()
        with (
            patch("dynamo.profiler.rapid.build_default_task_configs", return_value={}),
            patch(
                "dynamo.profiler.rapid._execute_task_configs",
                return_value=self._execute_return(ttft=123.0, tpot=7.0),
            ),
            patch("dynamo.profiler.rapid._generate_dgd_from_pick", return_value=None),
        ):
            outcome = self._simulate(spec, "default")

        latencies = outcome["best_latencies"]
        assert latencies["ttft"] == 123.0
        assert latencies["tpot"] == 7.0
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Unit tests for thorough.py's _pick_thorough_best_config helper.
Benchmarking helpers (_benchmark_prefill_candidates, _benchmark_decode_candidates)
require live K8s deployments and are covered by the mocked end-to-end tests
in test_profile_sla_dgdr.py.
"""
import sys
from pathlib import Path
from unittest.mock import patch
import pandas as pd
import pytest
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
try:
from dynamo.profiler.thorough import _pick_thorough_best_config
from dynamo.profiler.utils.aic_dataframe import build_decode_row, build_prefill_row
from dynamo.profiler.utils.dgdr_v1beta1_types import (
DynamoGraphDeploymentRequestSpec,
HardwareSpec,
SLASpec,
WorkloadSpec,
)
except ImportError as e:
pytest.skip(f"Skip (missing dependency): {e}", allow_module_level=True)
# ---------------------------------------------------------------------------
# Shared fixtures
# ---------------------------------------------------------------------------
def _make_dgdr(**overrides) -> DynamoGraphDeploymentRequestSpec:
    """Return a minimal trtllm/h200_sxm DGDR spec; keyword arguments replace defaults."""
    fields = {
        "model": "Qwen/Qwen3-32B",
        "backend": "trtllm",
        "image": "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:latest",
        "hardware": HardwareSpec(gpuSku="h200_sxm", totalGpus=8, numGpusPerNode=8),
        "workload": WorkloadSpec(isl=4000, osl=1000),
        "sla": SLASpec(ttft=2000.0, itl=50.0),
        # Caller-supplied values win over the defaults above.
        **overrides,
    }
    return DynamoGraphDeploymentRequestSpec(**fields)
def _stub_dfs():
    """Return one-row prefill and decode DataFrames for the pick functions.

    Rows are produced by build_prefill_row / build_decode_row so the frames
    contain all columns expected by _build_disagg_summary_dict (called via
    build_disagg_df_from_static in load_match / default paths).
    """
    # Keyword arguments shared by both row builders.
    parallelism = {"tp": 1, "pp": 1, "dp": 1, "moe_tp": 1, "moe_ep": 1}
    target = {"backend": "trtllm", "system": "h200_sxm"}

    prefill_row = build_prefill_row(
        model="Qwen/Qwen3-32B",
        isl=4000,
        osl=1000,
        ttft=50.0,
        **parallelism,
        **target,
    )
    decode_row = build_decode_row(
        tpot=10.0,
        thpt_per_gpu=100.0,
        num_request=1,
        num_gpus=1,
        osl=1000,
        **parallelism,
        **target,
    )
    return pd.DataFrame([prefill_row]), pd.DataFrame([decode_row])
def _mock_result():
return {
"best_config_df": pd.DataFrame(),
"best_latencies": {"ttft": 0.0, "tpot": 0.0, "request_latency": 0.0},
}
# ---------------------------------------------------------------------------
# _pick_thorough_best_config
# ---------------------------------------------------------------------------
class TestPickThoroughBestConfig:
    """Dispatch of ``_pick_thorough_best_config`` to the per-mode pick functions."""

    def _pick(
        self, prefill_df, decode_df, picking_mode, request_latency, total_gpus, dgdr
    ):
        """Call the helper under test with the fixed 2000.0 / 50.0 ttft/tpot targets."""
        return _pick_thorough_best_config(
            prefill_df,
            decode_df,
            picking_mode,
            2000.0,
            50.0,
            request_latency,
            total_gpus,
            dgdr,
        )

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_autoscale_calls_pick_autoscale(self):
        """autoscale mode delegates to pick_autoscale with ttft/tpot targets."""
        prefill_df, decode_df = _stub_dfs()
        spec = _make_dgdr()
        canned = _mock_result()
        with patch(
            "dynamo.profiler.thorough.pick_autoscale", return_value=canned
        ) as pick_mock:
            outcome = self._pick(prefill_df, decode_df, "autoscale", None, 8, spec)
        pick_mock.assert_called_once_with(prefill_df, decode_df, 2000.0, 50.0)
        assert outcome is canned

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_load_match_uses_request_latency_when_set(self):
        """load_match passes target_request_latency when request_latency is provided."""
        prefill_df, decode_df = _stub_dfs()
        workload = WorkloadSpec(isl=4000, osl=1000, requestRate=5.0)
        spec = _make_dgdr(workload=workload)
        with patch(
            "dynamo.profiler.thorough.pick_load_match", return_value=_mock_result()
        ) as pick_mock:
            self._pick(prefill_df, decode_df, "load_match", 35000.0, 8, spec)
        forwarded = pick_mock.call_args.kwargs
        assert forwarded["target_request_latency"] == 35000.0
        assert "target_tpot" not in forwarded
        assert forwarded["target_request_rate"] == 5.0
        assert forwarded["max_total_gpus"] == 8

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_load_match_falls_back_to_target_tpot(self):
        """load_match passes target_tpot when no request_latency."""
        prefill_df, decode_df = _stub_dfs()
        spec = _make_dgdr()
        with patch(
            "dynamo.profiler.thorough.pick_load_match", return_value=_mock_result()
        ) as pick_mock:
            self._pick(prefill_df, decode_df, "load_match", None, 8, spec)
        forwarded = pick_mock.call_args.kwargs
        assert forwarded["target_tpot"] == 50.0
        assert "target_request_latency" not in forwarded

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_default_uses_request_latency_when_set(self):
        """default mode passes target_request_latency when provided."""
        prefill_df, decode_df = _stub_dfs()
        spec = _make_dgdr()
        with patch(
            "dynamo.profiler.thorough.pick_default", return_value=_mock_result()
        ) as pick_mock:
            self._pick(prefill_df, decode_df, "default", 35000.0, 8, spec)
        forwarded = pick_mock.call_args.kwargs
        assert forwarded["target_request_latency"] == 35000.0
        assert forwarded["total_gpus"] == 8
        assert forwarded["serving_mode"] == "disagg"

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_default_falls_back_to_target_tpot(self):
        """default mode passes target_tpot when no request_latency."""
        prefill_df, decode_df = _stub_dfs()
        spec = _make_dgdr()
        with patch(
            "dynamo.profiler.thorough.pick_default", return_value=_mock_result()
        ) as pick_mock:
            self._pick(prefill_df, decode_df, "default", None, 8, spec)
        forwarded = pick_mock.call_args.kwargs
        assert forwarded["target_tpot"] == 50.0
        assert "target_request_latency" not in forwarded

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_load_match_omits_workload_kwargs_when_no_workload(self):
        """When dgdr.workload has no rate/concurrency, those kwargs are absent."""
        prefill_df, decode_df = _stub_dfs()
        spec = _make_dgdr()  # no requestRate or concurrency
        with patch(
            "dynamo.profiler.thorough.pick_load_match", return_value=_mock_result()
        ) as pick_mock:
            # total_gpus is 0 here, unlike the other tests which pass 8.
            self._pick(prefill_df, decode_df, "load_match", None, 0, spec)
        forwarded = pick_mock.call_args.kwargs
        assert "target_request_rate" not in forwarded
        assert "max_total_gpus" not in forwarded
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Test suite for profile_sla aiconfigurator functionality.
profile_sla should be able to use aiconfigurator functionality
even without access to any GPU system.
"""
import sys
from pathlib import Path
import pytest
# Add the project root to sys.path to enable imports
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
try:
from dynamo.profiler.profile_sla import run_profile # noqa: E402
from dynamo.profiler.utils.defaults import SearchStrategy # noqa: E402
from dynamo.profiler.utils.model_info import ModelInfo # noqa: E402
except ImportError as _e:
pytest.skip(f"Skip testing (refactor in progress): {_e}", allow_module_level=True)
# Module-level marker list: pytest applies the `aiconfigurator` mark to every
# test collected from this module.
pytestmark = [
    pytest.mark.aiconfigurator,
]
# Override the logger fixture from conftest.py to prevent directory creation
@pytest.fixture(autouse=True)
def logger(request):
    """No-op stand-in for the ``logger`` fixture from tests/conftest.py.

    The conftest version creates a directory named after each test; this
    override yields immediately so that no directories are created and no
    file handlers are added.
    """
    # Nothing to set up or tear down.
    yield None
class TestProfileSlaAiconfigurator:
    """Test class for profile_sla aiconfigurator functionality."""

    @pytest.fixture
    def llm_args(self, request):
        """Provide an ``Args`` object carrying the profiler settings for one test."""
        test_name = request.node.name
        settings = {
            "model": "Qwen/Qwen3-32B",  # Set to match aic_hf_id for consistency
            "dgd_image": "",
            "backend": "trtllm",
            "config": "examples/backends/trtllm/deploy/disagg.yaml",
            # Use unique output directory per test for parallel execution
            "output_dir": f"/tmp/test_profiling_results_{test_name}",
            "namespace": f"test-namespace-{test_name}",
            "min_num_gpus_per_engine": 1,
            "max_num_gpus_per_engine": 8,
            "skip_existing_results": False,
            "force_rerun": False,
            "isl": 3000,
            "osl": 500,
            "ttft": 50,
            "itl": 10,
            "prefill_interpolation_granularity": 16,
            "decode_interpolation_granularity": 6,
            "service_name": "",
            "dry_run": False,
            "num_gpus_per_node": 8,
            "deploy_after_profile": False,
            "pick_with_webui": False,
            # Use RAPID strategy to leverage AI Configurator for perf estimation
            # This avoids Kubernetes deployments while testing aiconfigurator functionality
            "search_strategy": SearchStrategy.RAPID,
            "system": "h200_sxm",  # Must match aic_system for RAPID strategy
            # Provide minimal model_info to avoid HF queries
            "model_info": ModelInfo(
                model_size=16384.0,
                architecture="TestArchitecture",
                is_moe=False,
                max_context_length=16384,
            ),
        }

        class Args:
            def __init__(self):
                # Expose every setting as a plain instance attribute.
                vars(self).update(settings)

        return Args()

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    @pytest.mark.performance
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.parametrize("missing_arg", ["system", "model"])
    async def test_aiconfigurator_missing_args(self, llm_args, missing_arg):
        """A required RAPID argument set to None must raise ValueError."""
        # Check that validation error happens when a required arg is missing for RAPID strategy.
        # These args are required when using SearchStrategy.RAPID with AI Configurator.
        setattr(llm_args, missing_arg, None)
        with pytest.raises(ValueError):
            await run_profile(llm_args)

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    @pytest.mark.performance
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "arg_name, bad_value",
        [
            # these values don't exist in the aiconfigurator database.
            ("system", "fake_gpu_system"),
        ],
    )
    async def test_aiconfigurator_no_data(self, llm_args, arg_name, bad_value):
        """A value missing from the aiconfigurator database raises 'Database not found'."""
        # Check that an appropriate error is raised when the system/model/backend
        # is not found in the aiconfigurator database.
        setattr(llm_args, arg_name, bad_value)
        with pytest.raises(ValueError, match="Database not found"):
            await run_profile(llm_args)

    @pytest.mark.trtllm
    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.gpu_0
    @pytest.mark.integration
    async def test_trtllm_aiconfigurator_single_model(self, llm_args):
        """profile_sla runs with the unmodified fixture settings."""
        # Test that profile_sla works with the model & backend in the llm_args fixture.
        await run_profile(llm_args)

    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.gpu_1
    @pytest.mark.integration
    @pytest.mark.nightly
    # fmt: off
    @pytest.mark.parametrize(
        "backend",
        [
            pytest.param("trtllm", marks=pytest.mark.trtllm),
            pytest.param("vllm", marks=pytest.mark.vllm),
            pytest.param("sglang", marks=pytest.mark.sglang),
        ],
    )
    # fmt: on
    @pytest.mark.parametrize(
        "hf_model_id",
        [
            "Qwen/Qwen3-32B",
            "meta-llama/Llama-3.1-405B",
        ],
    )
    async def test_aiconfigurator_dense_models(self, llm_args, hf_model_id, backend):
        """profile_sla runs for each parametrized backend / model pair."""
        # Test that profile_sla works with a variety of backends and model names
        # using AI Configurator's RAPID strategy for performance estimation.
        # Backend version is not used with RAPID strategy - performance comes from AI Configurator.
        llm_args.model = hf_model_id  # Used by RAPID strategy
        llm_args.backend = backend
        await run_profile(llm_args)
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Test suite for profile_sla with DynamoGraphDeploymentRequestSpec input.
Tests the new DGDR-based profiler entry point across different configurations:
rapid/thorough, supported/unsupported, planner/no-planner, load-match, PVC, mocker.
All tests are no-GPU (gpu_0) and pre_merge.
"""
import asyncio
import os
import sys
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch
import numpy as np
import pytest
import yaml
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
try:
from dynamo.profiler.profile_sla import run_profile
from dynamo.profiler.utils.dgdr_v1beta1_types import (
BackendType,
DynamoGraphDeploymentRequestSpec,
SearchStrategy,
)
from dynamo.profiler.utils.profile_common import ProfilerOperationalConfig
except ImportError as _e:
pytest.skip(f"Skip testing (refactor in progress): {_e}", allow_module_level=True)
@pytest.fixture(autouse=True)
def logger(request):
    """Override the logger fixture to prevent test directory creation.

    The fixture is autouse, so it applies to every test in this module. It
    performs no setup or teardown; ``request`` is accepted but never used.
    """
    # Intentionally empty: hand control straight to the test.
    yield
def _load_dgdr(yaml_path) -> DynamoGraphDeploymentRequestSpec:
    """Parse a YAML file into a validated ``DynamoGraphDeploymentRequestSpec``.

    Args:
        yaml_path: Location of the YAML document (anything ``open`` accepts).

    Returns:
        The result of ``DynamoGraphDeploymentRequestSpec.model_validate``
        applied to the parsed document.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the host locale.
    with open(yaml_path, encoding="utf-8") as f:
        data = yaml.safe_load(f)
    return DynamoGraphDeploymentRequestSpec.model_validate(data)
def _make_ops(tmp_path, **overrides) -> ProfilerOperationalConfig:
    """Build a ``ProfilerOperationalConfig`` whose output lives under ``tmp_path``.

    Args:
        tmp_path: Base directory; results go to ``<tmp_path>/profiling_results``.
        **overrides: Keyword arguments that replace or extend the defaults
            (``output_dir`` and ``dry_run=False``).

    Returns:
        The constructed ``ProfilerOperationalConfig``.
    """
    # Later entries win, so caller-supplied overrides replace the defaults.
    settings = {
        "output_dir": str(tmp_path / "profiling_results"),
        "dry_run": False,
        **overrides,
    }
    return ProfilerOperationalConfig(**settings)
# Directory of YAML inputs loaded by the tests below, resolved relative to
# this test module rather than the current working directory.
CONFIGS_DIR = Path(__file__).parent / "configs"
class TestRapidSupported:
    """Rapid-strategy scenarios for an AIC-supported model (Qwen3-32B on h200_sxm/trtllm)."""

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_no_planner_no_load(self, tmp_path):
        """Case 1: default picking mode with neither planner nor target load."""
        request_spec = _load_dgdr(
            CONFIGS_DIR / "1_rapid_supported_no_planner_no_load.yaml"
        )
        op_config = _make_ops(tmp_path)

        asyncio.run(run_profile(request_spec, op_config))

        final_config = tmp_path / "profiling_results" / "final_config.yaml"
        assert final_config.exists()
        parsed = yaml.safe_load(final_config.read_text())
        assert parsed, "final_config.yaml should not be empty"
        assert "spec" in parsed

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_no_planner_with_load(self, tmp_path):
        """Case 2: load-match picking mode driven by requestRate."""
        request_spec = _load_dgdr(
            CONFIGS_DIR / "2_rapid_supported_no_planner_with_load.yaml"
        )
        op_config = _make_ops(tmp_path)

        asyncio.run(run_profile(request_spec, op_config))

        final_config = tmp_path / "profiling_results" / "final_config.yaml"
        assert final_config.exists()
        parsed = yaml.safe_load(final_config.read_text())
        assert parsed

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_pvc_no_planner_with_load(self, tmp_path):
        """Case 2b: load-match picking with a PVC-backed model cache."""
        request_spec = _load_dgdr(
            CONFIGS_DIR / "2b_rapid_supported_pvc_no_planner_with_load.yaml"
        )
        op_config = _make_ops(tmp_path)

        asyncio.run(run_profile(request_spec, op_config))

        final_config = tmp_path / "profiling_results" / "final_config.yaml"
        assert final_config.exists()
        parsed = yaml.safe_load(final_config.read_text())
        assert parsed

        # The generated spec must list a PVC named "model-cache".
        declared_pvcs = parsed.get("spec", {}).get("pvcs", [])
        assert any(
            pvc.get("name") == "model-cache" for pvc in declared_pvcs
        ), "PVC should be mounted"

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_e2e_latency_sla(self, tmp_path):
        """Case 2c: SLA expressed as e2eLatency rather than ttft/itl."""
        request_spec = _load_dgdr(CONFIGS_DIR / "2c_rapid_supported_e2e_latency.yaml")
        op_config = _make_ops(tmp_path)

        asyncio.run(run_profile(request_spec, op_config))

        final_config = tmp_path / "profiling_results" / "final_config.yaml"
        assert final_config.exists()
        parsed = yaml.safe_load(final_config.read_text())
        assert parsed

        # The validator is expected to have cleared ttft/itl on the spec.
        sla = request_spec.sla
        assert sla.ttft is None
        assert sla.itl is None
        assert sla.e2eLatency == 35000.0

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_both_concurrency_and_rate_rejected(self, tmp_path):
        """Case 2d: supplying both concurrency and requestRate must be rejected."""
        request_spec = _load_dgdr(
            CONFIGS_DIR / "2d_rapid_both_concurrency_and_rate_error.yaml"
        )
        op_config = _make_ops(tmp_path)

        with pytest.raises(ValueError, match="concurrency.*requestRate"):
            asyncio.run(run_profile(request_spec, op_config))

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_planner_rapid_sweep(self, tmp_path):
        """Case 3: autoscale picking with planner and rapid pre-deployment sweep."""
        request_spec = _load_dgdr(
            CONFIGS_DIR / "3_rapid_supported_planner_rapid_sweep.yaml"
        )
        op_config = _make_ops(tmp_path)

        asyncio.run(run_profile(request_spec, op_config))

        final_config = tmp_path / "profiling_results" / "final_config.yaml"
        assert final_config.exists()

        # The output is a multi-document YAML stream; the last document is
        # treated as the DGD.
        documents = list(yaml.safe_load_all(final_config.read_text()))
        assert len(documents) >= 2, "Planner config should produce multi-doc YAML"
        services = documents[-1].get("spec", {}).get("services", {})
        assert "Planner" in services, "Planner service should be added"
class TestRapidUnsupported:
    """Rapid-strategy scenarios for an AIC-unsupported model (Qwen3-32B on l40s/vllm)."""

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_no_planner_naive_fallback(self, tmp_path):
        """Case 4: the profiler falls back to naive config generation."""
        request_spec = _load_dgdr(CONFIGS_DIR / "4_rapid_unsupported_no_planner.yaml")
        op_config = _make_ops(tmp_path)

        asyncio.run(run_profile(request_spec, op_config))

        final_config = tmp_path / "profiling_results" / "final_config.yaml"
        assert final_config.exists()
        parsed = yaml.safe_load(final_config.read_text())
        assert parsed, "Naive fallback should produce a non-empty config"

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_planner_load_scaling_rapid_sweep_fallback(self, tmp_path):
        """Case 5: planner with load scaling; the rapid sweep falls back to none."""
        request_spec = _load_dgdr(CONFIGS_DIR / "5_rapid_unsupported_planner.yaml")
        op_config = _make_ops(tmp_path)

        # Smoke test only: success means run_profile returned without raising.
        asyncio.run(run_profile(request_spec, op_config))

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_planner_throughput_scaling_raises(self, tmp_path):
        """Case 5b: throughput-scaling planner on an unsupported combo must fail."""
        request_spec = _load_dgdr(
            CONFIGS_DIR / "5b_rapid_unsupported_planner_throughput_error.yaml"
        )
        op_config = _make_ops(tmp_path)

        expected_message = "Throughput-based planner scaling requires AIC support"
        with pytest.raises(ValueError, match=expected_message):
            asyncio.run(run_profile(request_spec, op_config))
class TestThoroughDryRun:
    """Thorough-strategy scenarios executed with ``dry_run=True`` (no real deployments)."""

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_no_planner_with_load(self, tmp_path):
        """Case 6: thorough search with load-match picking, dry-run."""
        request_spec = _load_dgdr(CONFIGS_DIR / "6_thorough_no_planner_with_load.yaml")
        op_config = _make_ops(tmp_path, dry_run=True)

        asyncio.run(run_profile(request_spec, op_config))

        final_config = tmp_path / "profiling_results" / "final_config.yaml"
        assert final_config.exists()

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_planner_rapid_sweep(self, tmp_path):
        """Case 7: thorough search with planner and rapid pre-deployment sweep, dry-run."""
        request_spec = _load_dgdr(CONFIGS_DIR / "7_thorough_planner_rapid_sweep.yaml")
        op_config = _make_ops(tmp_path, dry_run=True)

        asyncio.run(run_profile(request_spec, op_config))

        final_config = tmp_path / "profiling_results" / "final_config.yaml"
        assert final_config.exists()
class TestMockerEnabled:
    """Scenario where the mocker feature flag selects the mocker DGD over the real worker DGD."""

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_mocker_config_selected(self, tmp_path):
        """Case 3b: planner with mocker enabled should still emit a final config."""
        spec_file = CONFIGS_DIR / "3b_rapid_supported_planner_rapid_sweep_mocker.yaml"
        # The input file is optional; skip rather than fail when it is absent.
        if not spec_file.exists():
            pytest.skip("3b mocker config not found")

        request_spec = _load_dgdr(spec_file)
        op_config = _make_ops(tmp_path)

        asyncio.run(run_profile(request_spec, op_config))

        final_config = tmp_path / "profiling_results" / "final_config.yaml"
        assert final_config.exists()
class TestGateChecks:
    """Checks that invalid requests are rejected when the profiler starts."""

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_thorough_auto_backend_rejected(self, tmp_path):
        """Thorough search combined with the auto backend must raise ValueError."""
        request_spec = _load_dgdr(
            CONFIGS_DIR / "1_rapid_supported_no_planner_no_load.yaml"
        )
        # Mutate a known-good spec into the unsupported combination.
        request_spec.searchStrategy = SearchStrategy.Thorough
        request_spec.backend = BackendType.Auto
        op_config = _make_ops(tmp_path)

        with pytest.raises(ValueError, match="does not support 'auto' backend"):
            asyncio.run(run_profile(request_spec, op_config))
class TestAutoBackend:
    """Rapid-strategy scenario where the backend is left as ``auto``."""

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_no_planner_no_load(self, tmp_path):
        """Case 11: auto backend, rapid search, no planner and no target load."""
        request_spec = _load_dgdr(CONFIGS_DIR / "11_auto_rapid_no_planner_no_load.yaml")
        # Guard the precondition: the input must really request the auto backend.
        assert request_spec.backend == BackendType.Auto
        op_config = _make_ops(tmp_path)

        asyncio.run(run_profile(request_spec, op_config))

        final_config = tmp_path / "profiling_results" / "final_config.yaml"
        assert final_config.exists()
        parsed = yaml.safe_load(final_config.read_text())
        assert parsed, "final_config.yaml should not be empty"
        assert "spec" in parsed
class TestThoroughEdgeCases:
    """Boundary scenarios for the thorough strategy."""

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_empty_candidates_due_to_small_gpu(self, tmp_path):
        """Case 8: DeepSeek-R1 on 1 L40S GPU — model too large, no candidates."""
        request_spec = _load_dgdr(CONFIGS_DIR / "8_thorough_empty_candidates.yaml")
        op_config = _make_ops(tmp_path, dry_run=True)

        asyncio.run(run_profile(request_spec, op_config))

        results_dir = tmp_path / "profiling_results"
        assert (results_dir / "final_config.yaml").exists()

        # The status file is optional; when present it must carry a known status.
        status_path = results_dir / "profiler_status.yaml"
        if status_path.exists():
            status_doc = yaml.safe_load(status_path.read_text())
            assert status_doc.get("status") in ("success", "failed")
# ---------------------------------------------------------------------------
# Helpers for mocking K8s deployment + benchmark functions
# ---------------------------------------------------------------------------
def _mock_deployment_client():
"""Create a mock DynamoDeploymentClient that returns immediately."""
client = MagicMock()
client.create_deployment = AsyncMock()
client.wait_for_deployment_ready = AsyncMock()
client.get_deployment_logs = AsyncMock()
client.delete_deployment = AsyncMock()
client.get_service_url = MagicMock(return_value="http://mock:8000")
client.deployment_name = "mock-deployment"
client.base_log_dir = "/tmp"
return client
def _save_dummy_npz(output_dir: str):
"""Save dummy prefill + decode NPZ files matching the interpolation format."""
prefill_dir = os.path.join(output_dir, "selected_prefill_interpolation")
os.makedirs(prefill_dir, exist_ok=True)
np.savez(
os.path.join(prefill_dir, "raw_data.npz"),
prefill_isl=np.array([500, 1000, 2000, 4000]),
prefill_ttft=np.array([10.0, 20.0, 40.0, 80.0]),
prefill_thpt_per_gpu=np.array([50000.0, 50000.0, 50000.0, 50000.0]),
)
decode_dir = os.path.join(output_dir, "selected_decode_interpolation")
os.makedirs(decode_dir, exist_ok=True)
np.savez(
os.path.join(decode_dir, "raw_data.npz"),
x_kv_usage=np.array([0.1, 0.3, 0.5, 0.8]),
y_context_length=np.array([500, 1000, 2000, 4000]),
z_itl=np.array([5.0, 6.0, 7.0, 8.0]),
z_thpt_per_gpu=np.array([200.0, 180.0, 160.0, 140.0]),
max_kv_tokens=np.array([100000]),
)
# Backend name -> service name that the patched ``get_service_name_by_type``
# returns in ``_make_thorough_patches``.
_DECODE_SVC_NAMES = {
    "sglang": "decode",
    "vllm": "VllmDecodeWorker",
    "trtllm": "TRTLLMDecodeWorker",
}
def _make_thorough_patches(backend: str = "trtllm"):
    """Create the un-started patchers that stub out thorough-mode dependencies.

    Args:
        backend: Key into ``_DECODE_SVC_NAMES``; unknown backends fall back to
            ``"TRTLLMDecodeWorker"``.

    Returns:
        A list of five ``patch`` objects targeting names in
        ``dynamo.profiler.thorough``. The caller is responsible for starting
        and stopping them.
    """
    decode_service = _DECODE_SVC_NAMES.get(backend, "TRTLLMDecodeWorker")

    # Every construction of the client yields a fresh mock.
    client_patch = patch(
        "dynamo.profiler.thorough.DynamoDeploymentClient",
        side_effect=lambda **kw: _mock_deployment_client(),
    )
    # Benchmark helpers return fixed numbers instead of measuring anything.
    ttft_patch = patch("dynamo.profiler.thorough.get_prefill_ttft", return_value=50.0)
    decode_patch = patch(
        "dynamo.profiler.thorough.get_decode_itl_and_thpt_per_gpu",
        return_value=(8.0, 125.0),
    )
    request_range_patch = patch(
        "dynamo.profiler.thorough.get_num_request_range", return_value=[1, 4, 8]
    )
    service_name_patch = patch(
        "dynamo.profiler.thorough.get_service_name_by_type",
        return_value=decode_service,
    )

    return [
        client_patch,
        ttft_patch,
        decode_patch,
        request_range_patch,
        service_name_patch,
    ]
# Backward compat: existing tests use the trtllm-flavored list.
# Built once at import time and shared by the tests that reference it; the
# patchers stay inactive until a test starts them.
_THOROUGH_PATCHES = _make_thorough_patches("trtllm")
def _patch_kv_cache_log(backend: str = "trtllm"):
    """Return a patcher that fixes the KV-cache size reported by a config modifier.

    Args:
        backend: Key into ``CONFIG_MODIFIERS`` selecting the modifier to patch.

    Returns:
        An un-started ``patch.object`` that makes
        ``get_kv_cache_size_from_dynamo_log`` on that modifier return 100000.
    """
    # Imported lazily so the lookup happens only when a test asks for the patch.
    from dynamo.profiler.utils.config_modifiers import CONFIG_MODIFIERS

    return patch.object(
        CONFIG_MODIFIERS[backend],
        "get_kv_cache_size_from_dynamo_log",
        return_value=100000,
    )
class TestThoroughMocked:
    """Thorough mode with mocked K8s deployments and benchmark functions.

    Only K8s client, AIPerf benchmarks, and log-file reads are mocked.
    Enumeration, picking, and DGD generation run for real via AIC.
    """

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_thorough_no_planner_with_load(self, tmp_path):
        """Case 6 (mocked): thorough + load-match, full pipeline without real GPUs."""
        from contextlib import ExitStack

        dgdr = _load_dgdr(CONFIGS_DIR / "6_thorough_no_planner_with_load.yaml")
        ops = _make_ops(tmp_path)

        # ExitStack undoes every patch that was successfully entered, even if a
        # later patch fails to start, so no patch leaks into other tests.
        with ExitStack() as stack:
            stack.enter_context(_patch_kv_cache_log("trtllm"))
            for p in _THOROUGH_PATCHES:
                stack.enter_context(p)
            asyncio.run(run_profile(dgdr, ops))

        output = tmp_path / "profiling_results" / "final_config.yaml"
        assert output.exists()
        config = yaml.safe_load(output.read_text())
        assert config, "Mocked thorough should produce a non-empty config"
        assert "spec" in config

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_thorough_planner_thorough_sweep(self, tmp_path):
        """Case 7b: thorough search + thorough interpolation, fully mocked."""
        from contextlib import ExitStack

        dgdr = _load_dgdr(CONFIGS_DIR / "7b_thorough_planner_thorough_sweep.yaml")
        ops = _make_ops(tmp_path)

        # Both stand-ins ignore their arguments and write the full set of dummy
        # NPZ files (prefill and decode) into the profiler output directory.
        def mock_profile_prefill(work_dir, *args, **kwargs):
            _save_dummy_npz(ops.output_dir)

        def mock_profile_decode(work_dir, *args, **kwargs):
            _save_dummy_npz(ops.output_dir)

        interp_patches = [
            patch(
                "dynamo.profiler.interpolation.DynamoDeploymentClient",
                side_effect=lambda **kw: _mock_deployment_client(),
            ),
            patch(
                "dynamo.profiler.interpolation.profile_prefill",
                side_effect=mock_profile_prefill,
            ),
            patch(
                "dynamo.profiler.interpolation.profile_decode",
                side_effect=mock_profile_decode,
            ),
            patch(
                "dynamo.profiler.interpolation.get_service_name_by_type",
                return_value="TRTLLMWorker",
            ),
        ]

        # Same leak-proof activation as above, covering both patch sets.
        with ExitStack() as stack:
            stack.enter_context(_patch_kv_cache_log("trtllm"))
            for p in _THOROUGH_PATCHES + interp_patches:
                stack.enter_context(p)
            asyncio.run(run_profile(dgdr, ops))

        output = tmp_path / "profiling_results" / "final_config.yaml"
        assert output.exists()
        raw = output.read_text()
        docs = list(yaml.safe_load_all(raw))
        assert len(docs) >= 2, "Planner + profiling data should produce multi-doc YAML"

        prefill_npz = (
            tmp_path
            / "profiling_results"
            / "selected_prefill_interpolation"
            / "raw_data.npz"
        )
        decode_npz = (
            tmp_path
            / "profiling_results"
            / "selected_decode_interpolation"
            / "raw_data.npz"
        )
        assert prefill_npz.exists(), "Prefill interpolation data should be saved"
        assert decode_npz.exists(), "Decode interpolation data should be saved"
# ---------------------------------------------------------------------------
# Shared helper for mocked-thorough + override tests
# ---------------------------------------------------------------------------
def _run_mocked_thorough(dgdr, ops, backend: str):
    """Run the full mocked-thorough pipeline for an arbitrary backend.

    Args:
        dgdr: Request spec passed through to ``run_profile``.
        ops: Operational config passed through to ``run_profile``.
        backend: Backend name used to build the thorough-mode patches and to
            select the config modifier whose KV-cache lookup is patched.

    Raises:
        Whatever ``run_profile`` raises; all patches are undone first.
    """
    from contextlib import ExitStack

    thorough_patches = _make_thorough_patches(backend)
    kv_patch = _patch_kv_cache_log(backend)

    # ExitStack undoes every patch that was successfully entered, even if a
    # later patch fails to start, so no patch leaks into other tests.
    with ExitStack() as stack:
        stack.enter_context(kv_patch)
        for p in thorough_patches:
            stack.enter_context(p)
        asyncio.run(run_profile(dgdr, ops))
def _assert_overrides_applied(final_config_path: Path, dgdr):
    """Check that the generated DGD reflects the overrides requested in ``dgdr``.

    The last YAML document in ``final_config_path`` is treated as the DGD.
    Checks performed:

    * the file exists and the DGD has a ``spec``;
    * ``envs`` / ``backendFramework`` appear in the DGD spec when overridden;
    * for each overridden service that also exists in the DGD,
      ``sharedMemory`` is present when overridden and every overridden
      main-container arg appears in the service's args.

    Overridden services missing from the DGD are skipped without failing.

    Args:
        final_config_path: Path to the generated ``final_config.yaml``.
        dgdr: Request spec; ``dgdr.overrides.dgd`` is read as a mapping.
    """
    assert final_config_path.exists(), "final_config.yaml should exist"

    documents = list(yaml.safe_load_all(final_config_path.read_text()))
    dgd = documents[-1] if documents else {}
    assert dgd and "spec" in dgd, "DGD should have a spec"

    requested_spec = dgdr.overrides.dgd.get("spec", {})

    # Top-level spec fields: only presence is checked, not the value.
    for ovr_key in ("envs", "backendFramework"):
        if ovr_key not in requested_spec:
            continue
        assert ovr_key in dgd["spec"], f"Override field spec.{ovr_key} should exist"

    requested_services = requested_spec.get("services", {})
    generated_services = dgd.get("spec", {}).get("services", {})

    for svc_name, requested in requested_services.items():
        if svc_name not in generated_services:
            continue
        generated = generated_services[svc_name]

        if "sharedMemory" in requested:
            assert (
                "sharedMemory" in generated
            ), f"Override sharedMemory on {svc_name} should be applied"

        requested_container = requested.get("extraPodSpec", {}).get(
            "mainContainer", {}
        )
        if "args" not in requested_container:
            continue

        generated_args = (
            generated.get("extraPodSpec", {}).get("mainContainer", {}).get("args", [])
        )
        for arg in requested_container["args"]:
            assert (
                arg in generated_args
            ), f"Override arg '{arg}' should be in {svc_name} args"
class TestThoroughMockedOverrides:
    """Mocked thorough runs that verify DGD overrides reach the final config."""

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_9a_sglang_overrides(self, tmp_path):
        """Case 9a: SGLang thorough sweep with DSR1 overrides."""
        request_spec = _load_dgdr(
            CONFIGS_DIR / "9a_thorough_dsr1_sglang_overrides.yaml"
        )
        op_config = _make_ops(tmp_path)

        _run_mocked_thorough(request_spec, op_config, "sglang")

        final_config = tmp_path / "profiling_results" / "final_config.yaml"
        _assert_overrides_applied(final_config, request_spec)

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_10_override_security_context(self, tmp_path):
        """Case 10: imagePullSecrets injected via overrides into a new spec field."""
        request_spec = _load_dgdr(
            CONFIGS_DIR / "10_thorough_override_security_context.yaml"
        )
        op_config = _make_ops(tmp_path)

        _run_mocked_thorough(request_spec, op_config, "trtllm")

        final_config = tmp_path / "profiling_results" / "final_config.yaml"
        assert final_config.exists(), "final_config.yaml should exist"
        parsed = yaml.safe_load(final_config.read_text())
        assert parsed and "spec" in parsed

        pull_secrets = parsed["spec"].get("imagePullSecrets")
        assert pull_secrets is not None, "imagePullSecrets should be present"
        names = [entry["name"] for entry in pull_secrets]
        assert "my-registry-secret" in names
        assert "nvcr-pull-secret" in names
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Test suite for profile_sla dry-run functionality.
This test ensures that the profile_sla script can successfully run in dry-run mode
for vllm, sglang, and trtllm backends with their respective disagg.yaml configurations.
"""
import sys
from pathlib import Path
from unittest.mock import patch
import pytest
# Add the project root to sys.path to enable imports
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
try:
from dynamo.profiler.profile_sla import run_profile # noqa: E402
from dynamo.profiler.utils.defaults import SearchStrategy # noqa: E402
from dynamo.profiler.utils.model_info import ModelInfo # noqa: E402
from dynamo.profiler.utils.search_space_autogen import ( # noqa: E402
auto_generate_search_space,
)
except ImportError as _e:
pytest.skip(f"Skip testing (refactor in progress): {_e}", allow_module_level=True)
# Override the logger fixture from conftest.py to prevent directory creation
@pytest.fixture(autouse=True)
def logger(request):
    """Override the logger fixture to prevent test directory creation.

    This replaces the logger fixture from tests/conftest.py that creates
    directories named after each test. It is autouse, so it applies to every
    test in this module; ``request`` is accepted but never used.
    """
    # Simply do nothing - no directories created, no file handlers added
    yield
class TestProfileSLADryRun:
"""Test class for profile_sla dry-run functionality."""
@pytest.fixture
def vllm_args(self, request):
    """Create arguments for vllm backend dry-run test.

    Returns a plain ``Args`` instance pointing at the vllm disagg example
    config, with ``dry_run=True``, the THOROUGH search strategy, and a
    hand-built ``ModelInfo``.
    """

    class Args:
        def __init__(self):
            self.backend = "vllm"
            self.config = "examples/backends/vllm/deploy/disagg.yaml"
            # Use unique output directory per test for parallel execution
            self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
            self.namespace = f"test-namespace-{request.node.name}"
            self.model = ""
            self.dgd_image = ""
            # Bounds on GPUs per engine
            self.min_num_gpus_per_engine = 1
            self.max_num_gpus_per_engine = 8
            self.skip_existing_results = False
            self.force_rerun = False
            # Workload shape (isl/osl) and latency targets (ttft/itl)
            self.isl = 3000
            self.osl = 500
            self.ttft = 50
            self.itl = 10
            self.max_context_length = 16384
            self.prefill_interpolation_granularity = 16
            self.decode_interpolation_granularity = 6
            self.service_name = ""
            self.dry_run = True
            # AI Configurator fields are left unset
            self.aic_system = None
            self.aic_hf_id = None
            self.aic_backend = ""
            self.aic_backend_version = None
            self.num_gpus_per_node = 8
            self.search_strategy = SearchStrategy.THOROUGH
            self.system = ""
            self.deploy_after_profile = False
            self.pick_with_webui = False
            self.model_cache_pvc_name = ""
            self.model_cache_pvc_path = ""
            self.model_cache_pvc_mount_path = "/opt/model-cache"
            # Provide minimal model_info to avoid HF queries
            self.model_info = ModelInfo(
                model_size=16384.0,
                architecture="TestArchitecture",
                is_moe=False,
                max_context_length=self.max_context_length,
            )

    return Args()
@pytest.fixture
def sglang_args(self, request):
    """Create arguments for sglang backend dry-run test.

    Returns a plain ``Args`` instance pointing at the sglang disagg example
    config, with ``dry_run=True``, the THOROUGH search strategy, and a
    hand-built ``ModelInfo``.
    """

    class Args:
        def __init__(self):
            self.backend = "sglang"
            self.config = "examples/backends/sglang/deploy/disagg.yaml"
            # Use unique output directory per test for parallel execution
            self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
            self.namespace = f"test-namespace-{request.node.name}"
            self.model = ""
            self.dgd_image = ""
            # Bounds on GPUs per engine
            self.min_num_gpus_per_engine = 1
            self.max_num_gpus_per_engine = 8
            self.skip_existing_results = False
            self.force_rerun = False
            # Workload shape (isl/osl) and latency targets (ttft/itl)
            self.isl = 3000
            self.osl = 500
            self.ttft = 50
            self.itl = 10
            self.max_context_length = 16384
            self.prefill_interpolation_granularity = 16
            self.decode_interpolation_granularity = 6
            self.service_name = ""
            self.dry_run = True
            # AI Configurator fields are left unset
            self.aic_system = None
            self.aic_hf_id = None
            self.aic_backend = ""
            self.aic_backend_version = None
            self.num_gpus_per_node = 8
            self.search_strategy = SearchStrategy.THOROUGH
            self.system = ""
            self.deploy_after_profile = False
            self.pick_with_webui = False
            self.model_cache_pvc_name = ""
            self.model_cache_pvc_path = ""
            self.model_cache_pvc_mount_path = "/opt/model-cache"
            # Hand-built model info; reuses max_context_length set above
            self.model_info = ModelInfo(
                model_size=16384.0,
                architecture="TestArchitecture",
                is_moe=False,
                max_context_length=self.max_context_length,
            )

    return Args()
@pytest.mark.pre_merge
@pytest.mark.parallel
@pytest.mark.asyncio
@pytest.mark.gpu_0
@pytest.mark.integration
@pytest.mark.vllm
async def test_vllm_dryrun(self, vllm_args):
    """Test that profile_sla dry-run works for vllm backend with disagg.yaml config.

    Passes when ``run_profile`` returns without raising; no output is
    inspected.
    """
    # Run the profile in dry-run mode - should complete without errors
    await run_profile(vllm_args)
@pytest.mark.pre_merge
@pytest.mark.parallel
@pytest.mark.asyncio
@pytest.mark.gpu_0
@pytest.mark.integration
@pytest.mark.sglang
async def test_sglang_dryrun(self, sglang_args):
    """Test that profile_sla dry-run works for sglang backend with disagg.yaml config.

    Passes when ``run_profile`` returns without raising; no output is
    inspected.
    """
    # Run the profile in dry-run mode - should complete without errors
    await run_profile(sglang_args)
@pytest.fixture
def trtllm_args(self, request):
    """Create arguments for trtllm backend dry-run test.

    Returns a plain ``Args`` instance pointing at the trtllm disagg example
    config, with ``dry_run=True``, the THOROUGH search strategy, and a
    hand-built ``ModelInfo``.
    """

    class Args:
        def __init__(self):
            self.backend = "trtllm"
            self.config = "examples/backends/trtllm/deploy/disagg.yaml"
            # Use unique output directory per test for parallel execution
            self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
            self.namespace = f"test-namespace-{request.node.name}"
            self.model = ""
            self.dgd_image = ""
            # Bounds on GPUs per engine
            self.min_num_gpus_per_engine = 1
            self.max_num_gpus_per_engine = 8
            self.skip_existing_results = False
            self.force_rerun = False
            # Workload shape (isl/osl) and latency targets (ttft/itl)
            self.isl = 3000
            self.osl = 500
            self.ttft = 50
            self.itl = 10
            self.max_context_length = 16384
            self.prefill_interpolation_granularity = 16
            self.decode_interpolation_granularity = 6
            self.service_name = ""
            self.dry_run = True
            # AI Configurator fields are left unset
            self.aic_system = None
            self.aic_hf_id = None
            self.aic_backend = ""
            self.aic_backend_version = None
            self.num_gpus_per_node = 8
            self.search_strategy = SearchStrategy.THOROUGH
            self.system = ""
            self.deploy_after_profile = False
            self.pick_with_webui = False
            self.model_cache_pvc_name = ""
            self.model_cache_pvc_path = ""
            self.model_cache_pvc_mount_path = "/opt/model-cache"
            # Hand-built model info; reuses max_context_length set above
            self.model_info = ModelInfo(
                model_size=16384.0,
                architecture="TestArchitecture",
                is_moe=False,
                max_context_length=self.max_context_length,
            )

    return Args()
@pytest.mark.pre_merge
@pytest.mark.parallel
@pytest.mark.asyncio
@pytest.mark.gpu_0
@pytest.mark.integration
@pytest.mark.trtllm
async def test_trtllm_dryrun(self, trtllm_args):
    """Test that profile_sla dry-run works for trtllm backend with disagg.yaml config.

    Passes when ``run_profile`` returns without raising; no output is
    inspected.
    """
    # Run the profile in dry-run mode - should complete without errors
    await run_profile(trtllm_args)
@pytest.fixture
def sglang_moe_args(self, request):
    """Create arguments for the sglang MoE dry-run test.

    Returns a plain ``Args`` instance for the sglang backend that points at
    the DeepSeek-R1 16-GPU recipe config, with ``dry_run=True``, the THOROUGH
    search strategy, and a hand-built MoE ``ModelInfo``.
    """

    class Args:
        def __init__(self):
            self.backend = "sglang"
            self.config = "recipes/deepseek-r1/sglang/disagg-16gpu/deploy.yaml"
            # Use unique output directory per test for parallel execution
            self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
            self.namespace = f"test-namespace-{request.node.name}"
            self.model = ""
            self.dgd_image = ""
            # Larger GPU-per-engine bounds than the dense-model fixtures (8..32)
            self.min_num_gpus_per_engine = 8
            self.max_num_gpus_per_engine = 32
            self.skip_existing_results = False
            self.force_rerun = False
            # Workload shape (isl/osl) and latency targets (ttft/itl)
            self.isl = 3000
            self.osl = 500
            self.ttft = 50
            self.itl = 10
            self.max_context_length = 16384
            self.prefill_interpolation_granularity = 16
            self.decode_interpolation_granularity = 6
            self.service_name = ""
            self.dry_run = True
            # AI Configurator fields are left unset
            self.aic_system = None
            self.aic_hf_id = None
            self.aic_backend = ""
            self.aic_backend_version = None
            self.num_gpus_per_node = 8
            self.search_strategy = SearchStrategy.THOROUGH
            self.system = ""
            self.deploy_after_profile = False
            self.pick_with_webui = False
            # Added in newer profiler versions; keep Args compatible with search_space_autogen
            self.model_cache_pvc_name = ""
            self.model_cache_pvc_path = ""
            self.model_cache_pvc_mount_path = "/opt/model-cache"
            # Hand-built MoE model info; reuses max_context_length set above
            self.model_info = ModelInfo(
                model_size=65536.0,
                architecture="TestMoEArchitecture",
                is_moe=True,
                max_context_length=self.max_context_length,
                num_experts=16,
            )

    return Args()
@pytest.mark.pre_merge
@pytest.mark.parallel
@pytest.mark.asyncio
@pytest.mark.gpu_0
@pytest.mark.integration
@pytest.mark.sglang
async def test_sglang_moe_dryrun(self, sglang_moe_args):
    """Test that profile_sla dry-run works for sglang backend with MoE config.

    Passes when ``run_profile`` returns without raising; no output is
    inspected.
    """
    # Run the profile in dry-run mode - should complete without errors
    await run_profile(sglang_moe_args)
# Example tests with mocked GPU inventory
@pytest.fixture
def mock_h100_gpu_info(self):
    """Mock GPU info for H100 80GB cluster.

    Returns a plain dict. NOTE(review): no test visible in this part of the
    file requests this fixture — confirm it is still needed.
    """
    return {
        "gpus_per_node": 8,
        "model": "h100_sxm",
        "vram": 81920,  # 80GB in MiB
    }
@pytest.fixture
def mock_model_info(self):
    """Mock model info for DeepSeek-R1-Distill-Llama-8B.

    The autogen tests use this object as the return value of the patched
    ``get_model_info``.
    """
    return ModelInfo(
        model_size=16384.0,  # 16GB model in MiB
        architecture="LlamaForCausalLM",
        is_moe=False,
        max_context_length=16384,
    )
@pytest.fixture
def vllm_args_with_model_autogen(self, request):
    """Create arguments for vllm backend with model-based search space autogeneration.

    Unlike the plain dry-run fixtures, ``config`` is empty, a model id is
    set, and no ``model_info`` or ``aic_*`` attributes are attached. The
    paired test passes the object to ``auto_generate_search_space`` before
    profiling.
    """

    class Args:
        def __init__(self):
            self.backend = "vllm"
            self.config = ""
            # Use unique output directory per test for parallel execution
            self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
            self.namespace = f"test-namespace-{request.node.name}"
            self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"  # Specify model for autogen
            self.dgd_image = ""
            # Set to 0 to trigger auto-generation path
            self.min_num_gpus_per_engine = 0
            self.max_num_gpus_per_engine = 0
            self.skip_existing_results = False
            self.force_rerun = False
            # Workload shape (isl/osl) and latency targets (ttft/itl)
            self.isl = 3000
            self.osl = 500
            self.ttft = 50
            self.itl = 10
            self.max_context_length = 0
            self.prefill_interpolation_granularity = 16
            self.decode_interpolation_granularity = 6
            self.service_name = ""
            self.dry_run = True
            self.system = "h100_sxm"  # Renamed from aic_system, moved to hardware
            self.search_strategy = SearchStrategy.RAPID  # New top-level arg
            # GPU discovery values (auto-populated by Operator)
            self.num_gpus_per_node = 8
            self.gpu_model = "H100-SXM5-80GB"
            self.gpu_vram_mib = 81920
            self.deploy_after_profile = False
            self.pick_with_webui = False
            self.model_cache_pvc_name = ""
            self.model_cache_pvc_path = ""
            self.model_cache_pvc_mount_path = "/opt/model-cache"

    return Args()
@pytest.mark.pre_merge
@pytest.mark.parallel
@pytest.mark.asyncio
@pytest.mark.integration
@pytest.mark.gpu_0
@pytest.mark.vllm
@patch("dynamo.profiler.utils.model_info.get_model_info")
async def test_profile_with_autogen_search_space_h100(
    self,
    mock_get_model_info,
    vllm_args_with_model_autogen,
    mock_model_info,
):
    """Test profile_sla with auto-generated search space on mocked H100 cluster.

    This test demonstrates how search space is auto-generated based on model
    size and available GPU memory. GPU info is provided via command-line
    arguments injected by the Operator into the profiling config (DYN-2135).

    Passes when both ``auto_generate_search_space`` and ``run_profile``
    return without raising; no output is inspected.
    """
    # Configure the mock to return the appropriate model info
    # NOTE(review): the patch replaces the name on
    # dynamo.profiler.utils.model_info only; if the code under test imported
    # get_model_info by name into its own module, the patch would not take
    # effect there — confirm.
    mock_get_model_info.return_value = mock_model_info
    # Run the profile - the search space will be auto-generated
    # based on the model and GPU info from args
    auto_generate_search_space(vllm_args_with_model_autogen)
    await run_profile(vllm_args_with_model_autogen)
@pytest.fixture
def sglang_args_with_model_autogen(self, request):
    """Create arguments for sglang backend with model-based search space autogeneration.

    Unlike the plain dry-run fixtures, ``config`` is empty, a model id is
    set, and no ``model_info`` or ``aic_*`` attributes are attached. The
    paired test passes the object to ``auto_generate_search_space`` before
    profiling.
    """

    class Args:
        def __init__(self):
            self.backend = "sglang"
            self.config = ""
            # Use unique output directory per test for parallel execution
            self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
            self.namespace = f"test-namespace-{request.node.name}"
            self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"  # Specify model for autogen
            self.dgd_image = ""
            # Zero GPU bounds, as in the vllm autogen fixture
            self.min_num_gpus_per_engine = 0
            self.max_num_gpus_per_engine = 0
            self.skip_existing_results = False
            self.force_rerun = False
            # Workload shape (isl/osl) and latency targets (ttft/itl)
            self.isl = 3000
            self.osl = 500
            self.ttft = 50
            self.itl = 10
            self.max_context_length = 0
            self.prefill_interpolation_granularity = 16
            self.decode_interpolation_granularity = 6
            self.service_name = ""
            self.dry_run = True
            self.system = "h100_sxm"  # Renamed from aic_system, moved to hardware
            self.search_strategy = SearchStrategy.RAPID  # New top-level arg
            # GPU discovery values (auto-populated by Operator)
            self.num_gpus_per_node = 8
            self.gpu_model = "H100-SXM5-80GB"
            self.gpu_vram_mib = 81920
            self.deploy_after_profile = False
            self.pick_with_webui = False
            self.model_cache_pvc_name = ""
            self.model_cache_pvc_path = ""
            self.model_cache_pvc_mount_path = "/opt/model-cache"

    return Args()
@pytest.mark.pre_merge
@pytest.mark.parallel
@pytest.mark.asyncio
@pytest.mark.gpu_0
@pytest.mark.integration
@pytest.mark.sglang
@pytest.mark.skip(
    reason="Blocked on AI Configurator database format: sglang 0.5.6.post2 database "
    "is in legacy format missing 'gemm_dtype' field. "
    "See: KeyError in aiconfigurator/sdk/perf_database.py"
)
@patch("dynamo.profiler.utils.model_info.get_model_info")
async def test_sglang_profile_with_autogen_search_space_h100(
    self,
    mock_get_model_info,
    sglang_args_with_model_autogen,
    mock_model_info,
):
    """Test profile_sla with auto-generated search space for sglang on mocked H100 cluster.

    This test demonstrates how search space is auto-generated based on model
    size and available GPU memory for sglang backend. GPU info is provided via
    command-line arguments injected by the Operator into the profiling config (DYN-2135).

    NOTE: Currently skipped due to AI Configurator database format issue.
    The sglang 0.5.6.post2 database for h100_sxm is in legacy format and missing
    the required 'gemm_dtype' field, causing KeyError during database loading.
    """
    # Configure the mock to return the appropriate model info
    # NOTE(review): the patch replaces the name on
    # dynamo.profiler.utils.model_info only; if the code under test imported
    # get_model_info by name into its own module, the patch would not take
    # effect there — confirm.
    mock_get_model_info.return_value = mock_model_info
    # Run the profile - the search space will be auto-generated
    # based on the model and GPU info from args
    auto_generate_search_space(sglang_args_with_model_autogen)
    await run_profile(sglang_args_with_model_autogen)
@pytest.fixture
def trtllm_args_with_model_autogen(self, request):
    """Create arguments for trtllm backend with model-based search space autogeneration.

    Unlike the plain dry-run fixtures, ``config`` is empty, a model id is
    set, and no ``model_info`` or ``aic_*`` attributes are attached. The
    paired test passes the object to ``auto_generate_search_space`` before
    profiling.
    """

    class Args:
        def __init__(self):
            self.backend = "trtllm"
            self.config = ""
            # Use unique output directory per test for parallel execution
            self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
            self.namespace = f"test-namespace-{request.node.name}"
            self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"  # Specify model for autogen
            self.dgd_image = ""
            # Zero GPU bounds, as in the vllm autogen fixture
            self.min_num_gpus_per_engine = 0
            self.max_num_gpus_per_engine = 0
            self.skip_existing_results = False
            self.force_rerun = False
            # Workload shape (isl/osl) and latency targets (ttft/itl)
            self.isl = 3000
            self.osl = 500
            self.ttft = 50
            self.itl = 10
            self.max_context_length = 0
            self.prefill_interpolation_granularity = 16
            self.decode_interpolation_granularity = 6
            self.service_name = ""
            self.dry_run = True
            self.system = "h100_sxm"  # Renamed from aic_system, moved to hardware
            self.search_strategy = SearchStrategy.RAPID  # New top-level arg
            # GPU discovery values (auto-populated by Operator)
            self.num_gpus_per_node = 8
            self.gpu_model = "H100-SXM5-80GB"
            self.gpu_vram_mib = 81920
            self.deploy_after_profile = False
            self.pick_with_webui = False
            self.model_cache_pvc_name = ""
            self.model_cache_pvc_path = ""
            self.model_cache_pvc_mount_path = "/opt/model-cache"

    return Args()
@pytest.mark.pre_merge
@pytest.mark.parallel
@pytest.mark.asyncio
@pytest.mark.gpu_0
@pytest.mark.integration
@pytest.mark.trtllm
@patch("dynamo.profiler.utils.model_info.get_model_info")
async def test_trtllm_profile_with_autogen_search_space_h100(
    self,
    mock_get_model_info,
    trtllm_args_with_model_autogen,
    mock_model_info,
):
    """Run profile_sla for trtllm with an auto-generated search space on a mocked H100 cluster.

    The search space is derived from the model and the GPU information
    carried on the args object; that GPU information stands in for the
    command-line arguments the Operator injects into the profiling
    config (DYN-2135). The patched ``get_model_info`` returns the
    ``mock_model_info`` fixture.
    """
    profiler_args = trtllm_args_with_model_autogen

    # Make the patched model-info lookup return the prepared fixture.
    mock_get_model_info.return_value = mock_model_info

    # Generate the search space from the args, then run the profile.
    auto_generate_search_space(profiler_args)
    await run_profile(profiler_args)
# Unit tests for search_strategy and system attributes
@pytest.mark.pre_merge
@pytest.mark.unit
@pytest.mark.gpu_0
def test_vllm_args_has_search_strategy(self, vllm_args):
    """Check the vllm_args fixture exposes THOROUGH search_strategy and an empty system."""
    expected_attrs = (
        ("search_strategy", SearchStrategy.THOROUGH),
        ("system", ""),
    )
    for attr_name, wanted in expected_attrs:
        # Presence first, then value, for each attribute in turn.
        assert hasattr(vllm_args, attr_name)
        assert getattr(vllm_args, attr_name) == wanted
@pytest.mark.pre_merge
@pytest.mark.unit
@pytest.mark.gpu_0
def test_sglang_args_has_search_strategy(self, sglang_args):
    """Check the sglang_args fixture exposes THOROUGH search_strategy and an empty system."""
    expected_attrs = (
        ("search_strategy", SearchStrategy.THOROUGH),
        ("system", ""),
    )
    for attr_name, wanted in expected_attrs:
        # Presence first, then value, for each attribute in turn.
        assert hasattr(sglang_args, attr_name)
        assert getattr(sglang_args, attr_name) == wanted
@pytest.mark.pre_merge
@pytest.mark.unit
@pytest.mark.gpu_0
def test_trtllm_args_has_search_strategy(self, trtllm_args):
    """Check the trtllm_args fixture exposes THOROUGH search_strategy and an empty system."""
    expected_attrs = (
        ("search_strategy", SearchStrategy.THOROUGH),
        ("system", ""),
    )
    for attr_name, wanted in expected_attrs:
        # Presence first, then value, for each attribute in turn.
        assert hasattr(trtllm_args, attr_name)
        assert getattr(trtllm_args, attr_name) == wanted
@pytest.mark.pre_merge
@pytest.mark.unit
@pytest.mark.gpu_0
def test_sglang_moe_args_has_search_strategy(self, sglang_moe_args):
    """Check the sglang_moe_args fixture exposes THOROUGH search_strategy and an empty system."""
    expected_attrs = (
        ("search_strategy", SearchStrategy.THOROUGH),
        ("system", ""),
    )
    for attr_name, wanted in expected_attrs:
        # Presence first, then value, for each attribute in turn.
        assert hasattr(sglang_moe_args, attr_name)
        assert getattr(sglang_moe_args, attr_name) == wanted
@pytest.mark.pre_merge
@pytest.mark.unit
@pytest.mark.gpu_0
def test_model_autogen_args_have_rapid_strategy(
    self,
    vllm_args_with_model_autogen,
    sglang_args_with_model_autogen,
    trtllm_args_with_model_autogen,
):
    """Check every model-autogen fixture uses RAPID search and carries the H100 GPU info."""
    # Insertion order matches the order in which attributes are verified.
    expected_attrs = {
        "search_strategy": SearchStrategy.RAPID,
        "system": "h100_sxm",
        # GPU discovery attributes
        "num_gpus_per_node": 8,
        "gpu_model": "H100-SXM5-80GB",
        "gpu_vram_mib": 81920,
    }
    autogen_fixtures = (
        vllm_args_with_model_autogen,
        sglang_args_with_model_autogen,
        trtllm_args_with_model_autogen,
    )
    for autogen_args in autogen_fixtures:
        for attr_name, wanted in expected_attrs.items():
            assert hasattr(autogen_args, attr_name)
            assert getattr(autogen_args, attr_name) == wanted
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment