Unverified Commit 14d928cb authored by Hongkuan Zhou, committed by GitHub
Browse files

fix: only add profiling data to mocker workers (#7164)


Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
parent e4865b69
...@@ -107,7 +107,7 @@ def assemble_final_config( ...@@ -107,7 +107,7 @@ def assemble_final_config(
if profile: if profile:
output_dir = ops.output_dir if not ops.dry_run else None output_dir = ops.output_dir if not ops.dry_run else None
profile_cm = add_profile_data_to_config(base, output_dir) profile_cm = add_profile_data_to_config(base, output_dir, mocker_enabled=mocker)
if profile_cm: if profile_cm:
config_maps.append(profile_cm) config_maps.append(profile_cm)
...@@ -231,17 +231,21 @@ def add_planner_to_config( ...@@ -231,17 +231,21 @@ def add_planner_to_config(
def add_profile_data_to_config( def add_profile_data_to_config(
config_dict: dict, config_dict: dict,
output_dir: str | None, output_dir: str | None,
mocker_enabled: bool = False,
) -> Optional[dict]: ) -> Optional[dict]:
"""Create a profile-data ConfigMap and mount it into consumers in *config_dict*. """Create a profile-data ConfigMap and mount it into consumers in *config_dict*.
Consumers are auto-detected: Consumers are auto-detected:
- The **Planner** service (if present) gets the volume mounted. - The **Planner** service (if present) gets the volume mounted.
- **Mocker workers** (if present) get the volume mounted and - **Mocker workers** (when *mocker_enabled*) get the volume mounted and
``--planner-profile-data`` set. ``--planner-profile-data`` set.
Args: Args:
config_dict: The DGD config dict — mutated in place. config_dict: The DGD config dict — mutated in place.
output_dir: Directory containing profiling interpolation NPZ files. output_dir: Directory containing profiling interpolation NPZ files.
mocker_enabled: Only inject ``--planner-profile-data`` into workers
when the mocker backend is active. Non-mocker backends (vllm,
sglang, trtllm) do not recognise this argument.
Returns: Returns:
The ``profile_data_cm`` ConfigMap dict, or ``None`` if no profiling The ``profile_data_cm`` ConfigMap dict, or ``None`` if no profiling
...@@ -274,20 +278,25 @@ def add_profile_data_to_config( ...@@ -274,20 +278,25 @@ def add_profile_data_to_config(
planner_svc, profile_data_cm_name, PROFILE_DATA_MOUNT planner_svc, profile_data_cm_name, PROFILE_DATA_MOUNT
) )
# Mount into mocker workers if they exist # Mount into mocker workers only when the mocker backend is active.
services = config_dict.get("spec", {}).get("services", {}) # Non-mocker backends (vllm, sglang, trtllm) share the same service
for worker_name in _mocker_worker_names(): # names ("prefill", "decode") but do not accept --planner-profile-data.
worker_svc = services.get(worker_name) if mocker_enabled:
if worker_svc is not None: services = config_dict.get("spec", {}).get("services", {})
main_container = worker_svc.get("extraPodSpec", {}).get("mainContainer", {}) for worker_name in _mocker_worker_names():
args_list = main_container.get("args", []) worker_svc = services.get(worker_name)
args_list = set_argument_value( if worker_svc is not None:
args_list, "--planner-profile-data", PROFILE_DATA_MOUNT main_container = worker_svc.get("extraPodSpec", {}).get(
) "mainContainer", {}
main_container["args"] = args_list )
_mount_volume_into_service( args_list = main_container.get("args", [])
worker_svc, profile_data_cm_name, PROFILE_DATA_MOUNT args_list = set_argument_value(
) args_list, "--planner-profile-data", PROFILE_DATA_MOUNT
)
main_container["args"] = args_list
_mount_volume_into_service(
worker_svc, profile_data_cm_name, PROFILE_DATA_MOUNT
)
return profile_data_cm return profile_data_cm
......
...@@ -32,7 +32,10 @@ try: ...@@ -32,7 +32,10 @@ try:
PickedParallelConfig, PickedParallelConfig,
) )
from dynamo.profiler.utils.defaults import SearchStrategy from dynamo.profiler.utils.defaults import SearchStrategy
from dynamo.profiler.utils.dgd_generation import assemble_final_config from dynamo.profiler.utils.dgd_generation import (
add_profile_data_to_config,
assemble_final_config,
)
from dynamo.profiler.utils.dgdr_v1beta1_types import ( from dynamo.profiler.utils.dgdr_v1beta1_types import (
DynamoGraphDeploymentRequestSpec, DynamoGraphDeploymentRequestSpec,
FeaturesSpec, FeaturesSpec,
...@@ -583,6 +586,90 @@ class TestAssembleFinalConfig: ...@@ -583,6 +586,90 @@ class TestAssembleFinalConfig:
assert result == [profile_cm, mocker_base] assert result == [profile_cm, mocker_base]
# ---------------------------------------------------------------------------
# add_profile_data_to_config — mocker_enabled guard (DYN-2409)
# ---------------------------------------------------------------------------
class TestAddProfileDataMockerGuard:
    """Check that ``--planner-profile-data`` reaches workers only with the mocker on."""

    @staticmethod
    def _sglang_dgd():
        """Build a small DGD dict: a Planner plus sglang-style 'prefill'/'decode' workers."""

        def _worker(mode):
            # A fresh args list per worker so nothing mutable is shared.
            return {
                "extraPodSpec": {
                    "mainContainer": {
                        "args": [
                            "-m",
                            "dynamo.sglang",
                            "--model-path",
                            "Qwen/Qwen3-32B",
                            "--disaggregation-mode",
                            mode,
                        ]
                    }
                }
            }

        services = {
            "Planner": {
                "extraPodSpec": {
                    "mainContainer": {"args": ["--config", "{}"]},
                }
            },
        }
        # Insertion order mirrors the layout: Planner, then prefill, then decode.
        for mode in ("prefill", "decode"):
            services[mode] = _worker(mode)
        return {"spec": {"services": services}}

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_mocker_disabled_no_planner_profile_data_in_workers(self, tmp_path):
        """With mocker_enabled=False, no worker args may contain --planner-profile-data."""
        config = self._sglang_dgd()
        loader_patch = patch(
            f"{_DGD_GEN}._load_profiling_data", return_value={"prefill": {}}
        )
        with loader_patch:
            add_profile_data_to_config(config, str(tmp_path), mocker_enabled=False)

        workers = config["spec"]["services"]
        for name in ("prefill", "decode"):
            container = workers[name]["extraPodSpec"]["mainContainer"]
            assert (
                "--planner-profile-data" not in container["args"]
            ), f"sglang worker '{name}' should not have --planner-profile-data"

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_mocker_enabled_injects_planner_profile_data(self, tmp_path):
        """With mocker_enabled=True, every worker's args must contain --planner-profile-data."""
        config = self._sglang_dgd()
        loader_patch = patch(
            f"{_DGD_GEN}._load_profiling_data", return_value={"prefill": {}}
        )
        with loader_patch:
            add_profile_data_to_config(config, str(tmp_path), mocker_enabled=True)

        workers = config["spec"]["services"]
        for name in ("prefill", "decode"):
            container = workers[name]["extraPodSpec"]["mainContainer"]
            assert (
                "--planner-profile-data" in container["args"]
            ), f"mocker worker '{name}' should have --planner-profile-data"
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Regression tests: naive fallback resolved_backend propagation (bug fix) # Regression tests: naive fallback resolved_backend propagation (bug fix)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment