Unverified commit 14d928cb, authored by Hongkuan Zhou, committed by GitHub
Browse files

fix: only add profiling data to mocker workers (#7164)


Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
parent e4865b69
......@@ -107,7 +107,7 @@ def assemble_final_config(
if profile:
output_dir = ops.output_dir if not ops.dry_run else None
profile_cm = add_profile_data_to_config(base, output_dir)
profile_cm = add_profile_data_to_config(base, output_dir, mocker_enabled=mocker)
if profile_cm:
config_maps.append(profile_cm)
......@@ -231,17 +231,21 @@ def add_planner_to_config(
def add_profile_data_to_config(
config_dict: dict,
output_dir: str | None,
mocker_enabled: bool = False,
) -> Optional[dict]:
"""Create a profile-data ConfigMap and mount it into consumers in *config_dict*.
Consumers are auto-detected:
- The **Planner** service (if present) gets the volume mounted.
- **Mocker workers** (if present) get the volume mounted and
- **Mocker workers** (when *mocker_enabled*) get the volume mounted and
``--planner-profile-data`` set.
Args:
config_dict: The DGD config dict — mutated in place.
output_dir: Directory containing profiling interpolation NPZ files.
mocker_enabled: Only inject ``--planner-profile-data`` into workers
when the mocker backend is active. Non-mocker backends (vllm,
sglang, trtllm) do not recognise this argument.
Returns:
The ``profile_data_cm`` ConfigMap dict, or ``None`` if no profiling
......@@ -274,20 +278,25 @@ def add_profile_data_to_config(
planner_svc, profile_data_cm_name, PROFILE_DATA_MOUNT
)
# Mount into mocker workers if they exist
services = config_dict.get("spec", {}).get("services", {})
for worker_name in _mocker_worker_names():
worker_svc = services.get(worker_name)
if worker_svc is not None:
main_container = worker_svc.get("extraPodSpec", {}).get("mainContainer", {})
args_list = main_container.get("args", [])
args_list = set_argument_value(
args_list, "--planner-profile-data", PROFILE_DATA_MOUNT
)
main_container["args"] = args_list
_mount_volume_into_service(
worker_svc, profile_data_cm_name, PROFILE_DATA_MOUNT
)
# Mount into mocker workers only when the mocker backend is active.
# Non-mocker backends (vllm, sglang, trtllm) share the same service
# names ("prefill", "decode") but do not accept --planner-profile-data.
if mocker_enabled:
services = config_dict.get("spec", {}).get("services", {})
for worker_name in _mocker_worker_names():
worker_svc = services.get(worker_name)
if worker_svc is not None:
main_container = worker_svc.get("extraPodSpec", {}).get(
"mainContainer", {}
)
args_list = main_container.get("args", [])
args_list = set_argument_value(
args_list, "--planner-profile-data", PROFILE_DATA_MOUNT
)
main_container["args"] = args_list
_mount_volume_into_service(
worker_svc, profile_data_cm_name, PROFILE_DATA_MOUNT
)
return profile_data_cm
......
......@@ -32,7 +32,10 @@ try:
PickedParallelConfig,
)
from dynamo.profiler.utils.defaults import SearchStrategy
from dynamo.profiler.utils.dgd_generation import assemble_final_config
from dynamo.profiler.utils.dgd_generation import (
add_profile_data_to_config,
assemble_final_config,
)
from dynamo.profiler.utils.dgdr_v1beta1_types import (
DynamoGraphDeploymentRequestSpec,
FeaturesSpec,
......@@ -583,6 +586,90 @@ class TestAssembleFinalConfig:
assert result == [profile_cm, mocker_base]
# ---------------------------------------------------------------------------
# add_profile_data_to_config — mocker_enabled guard (DYN-2409)
# ---------------------------------------------------------------------------
class TestAddProfileDataMockerGuard:
    """Check that ``--planner-profile-data`` is gated on ``mocker_enabled``.

    Both tests run ``add_profile_data_to_config`` against a DGD whose worker
    services are named ``prefill`` and ``decode`` and then inspect each
    worker's main-container ``args`` list.
    """

    @staticmethod
    def _sglang_dgd():
        """Return a fresh minimal DGD dict: a Planner plus sglang-style workers.

        A new dict (with new ``args`` lists) is built on every call, so each
        test can let the function under test mutate it freely.
        """

        def sglang_worker(mode):
            # One worker service whose args launch dynamo.sglang in *mode*.
            return {
                "extraPodSpec": {
                    "mainContainer": {
                        "args": [
                            "-m",
                            "dynamo.sglang",
                            "--model-path",
                            "Qwen/Qwen3-32B",
                            "--disaggregation-mode",
                            mode,
                        ]
                    }
                }
            }

        services = {
            "Planner": {
                "extraPodSpec": {
                    "mainContainer": {"args": ["--config", "{}"]},
                }
            },
        }
        # Insertion order matches the service layout: Planner, prefill, decode.
        for mode in ("prefill", "decode"):
            services[mode] = sglang_worker(mode)
        return {"spec": {"services": services}}

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_mocker_disabled_no_planner_profile_data_in_workers(self, tmp_path):
        """With ``mocker_enabled=False`` no worker gains ``--planner-profile-data``."""
        config = self._sglang_dgd()
        # The profiling-data loader is replaced by a stub return value,
        # presumably so no real profiling files are needed in tmp_path.
        loader_target = f"{_DGD_GEN}._load_profiling_data"
        with patch(loader_target, return_value={"prefill": {}}):
            add_profile_data_to_config(config, str(tmp_path), mocker_enabled=False)

        services = config["spec"]["services"]
        for worker in ("prefill", "decode"):
            worker_args = services[worker]["extraPodSpec"]["mainContainer"]["args"]
            assert (
                "--planner-profile-data" not in worker_args
            ), f"sglang worker '{worker}' should not have --planner-profile-data"

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_mocker_enabled_injects_planner_profile_data(self, tmp_path):
        """With ``mocker_enabled=True`` every worker gains ``--planner-profile-data``."""
        config = self._sglang_dgd()
        # Same stubbed loader as the disabled case; only the flag differs.
        loader_target = f"{_DGD_GEN}._load_profiling_data"
        with patch(loader_target, return_value={"prefill": {}}):
            add_profile_data_to_config(config, str(tmp_path), mocker_enabled=True)

        services = config["spec"]["services"]
        for worker in ("prefill", "decode"):
            worker_args = services[worker]["extraPodSpec"]["mainContainer"]["args"]
            assert (
                "--planner-profile-data" in worker_args
            ), f"mocker worker '{worker}' should have --planner-profile-data"
# ---------------------------------------------------------------------------
# Regression tests: naive fallback resolved_backend propagation (bug fix)
# ---------------------------------------------------------------------------
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment