feat: support PVC model cache in profiler (#5124)

Signed-off-by: hongkuanz <hongkuanz@nvidia.com>

feat: support PVC model cache in profiler (#5124)
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
fbe6bb0a · Hongkuan Zhou · GitHub · 007c5b60 · fbe6bb0a · fbe6bb0a
Unverified commit fbe6bb0a, authored Jan 05, 2026 by Hongkuan Zhou; committed by GitHub on Jan 05, 2026.
7 changed files
--- a/benchmarks/profiler/utils/config_modifiers/protocol.py
+++ b/benchmarks/profiler/utils/config_modifiers/protocol.py
@@ -13,8 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Protocol
+from __future__ import annotations
+from typing import Any, Protocol
+from benchmarks.profiler.utils.config import (
+    Config,
+    Container,
+    PodSpec,
+    break_arguments,
+    get_service_name_by_type,
+    set_argument_value,
+)
 from benchmarks.profiler.utils.defaults import EngineType
 from dynamo.planner.defaults import SubComponentType
@@ -87,9 +97,266 @@ class ConfigModifierProtocol(Protocol):
        ...
    @classmethod
-    def update_model(cls, config: dict, model_name: str) -> dict:
+    def update_model(
+        cls, config: dict, model_name: str, model_path: str | None = None
+    ) -> dict:
        ...
    @classmethod
    def update_image(cls, config: dict, image: str) -> dict:
        ...
+    @classmethod
+    def update_model_from_pvc(
+        cls,
+        config: dict,
+        model_name: str,
+        pvc_name: str,
+        pvc_mount_path: str,
+        pvc_path: str,
+    ) -> dict:
+        ...
+class BaseConfigModifier:
+    """
+    Shared helper base class for profiler config modifiers.
+    This class intentionally lives in `protocol.py` so all backends can inherit
+    common PVC + volumeMount + frontend CLI patching behavior.
+    """
+    # Subclasses should override, e.g. "vllm" / "sglang" / "trtllm"
+    BACKEND: str = ""
+    # Worker CLI arg name for model path / name. vLLM uses "--model"; others use "--model-path".
+    WORKER_MODEL_PATH_ARG: str = "--model-path"
+    WORKER_SERVED_MODEL_NAME_ARG: str = "--served-model-name"
+    @classmethod
+    def _normalize_model_path(cls, pvc_mount_path: str, pvc_path: str) -> str:
+        """
+        Join a PVC mount point and a path inside the PVC into one model path.
+        Trailing slashes are dropped from `pvc_mount_path` and leading slashes from
+        `pvc_path`, so the two parts are separated by exactly one "/". Empty or
+        `None` inputs count as "". With no sub-path, the stripped mount path is
+        returned on its own.
+        """
+        base = pvc_mount_path.rstrip("/") if pvc_mount_path else ""
+        relative = pvc_path.lstrip("/") if pvc_path else ""
+        return f"{base}/{relative}" if relative else base
+    @classmethod
+    def _ensure_spec_pvc(cls, cfg: Config, pvc_name: str) -> None:
+        """
+        Make sure `cfg.spec.pvcs` lists `pvc_name` with `create` set to False.
+        The first dict entry whose "name" equals `pvc_name` is updated in place;
+        if there is none, a new entry is appended. The list is written back to
+        `cfg.spec.pvcs` in both cases (it may have started out as `None`).
+        """
+        declared = getattr(cfg.spec, "pvcs", None)
+        if declared is None:
+            declared = []
+        existing = next(
+            (
+                entry
+                for entry in declared
+                if isinstance(entry, dict) and entry.get("name") == pvc_name
+            ),
+            None,
+        )
+        if existing is None:
+            declared.append({"name": pvc_name, "create": False})
+        else:
+            # Profiling flows reuse an existing PVC; never ask for one to be created.
+            existing["create"] = False
+        setattr(cfg.spec, "pvcs", declared)
+    @classmethod
+    def _ensure_service_volume_mount(
+        cls, service: Any, pvc_name: str, mount_path: str
+    ) -> None:
+        """
+        Make sure `service.volumeMounts` mounts `pvc_name` at `mount_path`.
+        A dict entry already named `pvc_name` gets its "mountPoint" overwritten;
+        otherwise a new entry is appended. A missing or non-list `volumeMounts`
+        value is replaced by a fresh list before the entry is added.
+        """
+        mounts = getattr(service, "volumeMounts", None)
+        if not isinstance(mounts, list):
+            mounts = []
+        for mount in mounts:
+            if isinstance(mount, dict) and mount.get("name") == pvc_name:
+                mount["mountPoint"] = mount_path
+                break
+        else:
+            # No entry for this PVC yet.
+            mounts.append({"name": pvc_name, "mountPoint": mount_path})
+        setattr(service, "volumeMounts", mounts)
+    @classmethod
+    def _update_container_args_preserving_shell_form(
+        cls, container: Container, update_fn
+    ) -> None:
+        """
+        Rewrite `container.args` via `update_fn` without changing how they are packed.
+        The current args are split into tokens by `break_arguments`, passed to
+        `update_fn` (which must return the updated token list), and stored back on
+        the container. When `command` starts with `sh -c` / `/bin/sh -c` and the
+        args were a one-element list holding a string, the tokens are re-joined
+        with `shlex.join` into a single string; otherwise the token list is stored
+        as-is.
+        """
+        previous_args = container.args
+        command = container.command or []
+        runs_through_shell = isinstance(command, list) and command[:2] in (
+            ["/bin/sh", "-c"],
+            ["sh", "-c"],
+        )
+        packed_as_one_string = (
+            isinstance(previous_args, list)
+            and len(previous_args) == 1
+            and isinstance(previous_args[0], str)
+        )
+        updated_tokens = update_fn(break_arguments(previous_args))
+        if not (runs_through_shell and packed_as_one_string):
+            container.args = updated_tokens
+            return
+        # `sh -c` takes its script as one argument, so collapse the tokens again.
+        import shlex
+        container.args = [shlex.join(updated_tokens)]
+    @classmethod
+    def _update_frontend_cli(
+        cls, cfg: Config, model_name: str, model_path: str
+    ) -> None:
+        """
+        Set `--model-name` and `--model-path` on the "Frontend" service's main container.
+        Does nothing when the config has no "Frontend" service. A missing
+        `extraPodSpec` or `mainContainer` is created first. If the container has
+        neither command nor args, the command is set to
+        `python3 -m dynamo.frontend` with empty args before the flags are added.
+        """
+        frontend = cfg.spec.services.get("Frontend")
+        if not frontend:
+            return
+        if frontend.extraPodSpec is None:
+            frontend.extraPodSpec = PodSpec(mainContainer=Container())
+        if frontend.extraPodSpec.mainContainer is None:
+            frontend.extraPodSpec.mainContainer = Container()
+        container = frontend.extraPodSpec.mainContainer
+        # With no command/args the operator defaults apply, so the full CLI has
+        # to be spelled out before extra flags can be attached to it.
+        if not (container.command or container.args):
+            container.command = ["python3", "-m", "dynamo.frontend"]
+            container.args = []
+        def _set_frontend_model_flags(tokens: list[str]) -> list[str]:
+            with_name = set_argument_value(tokens, "--model-name", model_name)
+            return set_argument_value(with_name, "--model-path", model_path)
+        cls._update_container_args_preserving_shell_form(
+            container, _set_frontend_model_flags
+        )
+    @classmethod
+    def _apply_model_update_to_cfg(
+        cls,
+        cfg: Config,
+        model_name: str,
+        model_path: str,
+        patch_frontend: bool,
+    ) -> None:
+        """
+        Write the model path and served model name into a validated DGD config.
+        Shared by `update_model()` and `update_model_from_pvc()`. For the prefill
+        and decode services that can be resolved and that have a main container,
+        `WORKER_MODEL_PATH_ARG` is set to `model_path` and
+        `WORKER_SERVED_MODEL_NAME_ARG` to `model_name`. Services that cannot be
+        resolved are skipped. When `patch_frontend` is true the Frontend CLI is
+        patched as well.
+        """
+        # The patch only depends on the arguments of this call, so build it once.
+        def _set_worker_model_flags(tokens: list[str]) -> list[str]:
+            with_path = set_argument_value(
+                tokens, cls.WORKER_MODEL_PATH_ARG, model_path
+            )
+            return set_argument_value(
+                with_path, cls.WORKER_SERVED_MODEL_NAME_ARG, model_name
+            )
+        for sub_component in (SubComponentType.PREFILL, SubComponentType.DECODE):
+            try:
+                svc_name = get_service_name_by_type(cfg, cls.BACKEND, sub_component)
+            except Exception:
+                continue
+            if svc_name not in cfg.spec.services:
+                continue
+            pod_spec = cfg.spec.services[svc_name].extraPodSpec
+            if not pod_spec or not pod_spec.mainContainer:
+                continue
+            cls._update_container_args_preserving_shell_form(
+                pod_spec.mainContainer, _set_worker_model_flags
+            )
+        if patch_frontend:
+            cls._update_frontend_cli(cfg, model_name=model_name, model_path=model_path)
+    @classmethod
+    def update_model(
+        cls, config: dict, model_name: str, model_path: str | None = None
+    ) -> dict:
+        """
+        Point a DGD config at a model and return the updated config as a dict.
+        Args:
+            config: DGD config dict
+            model_name: served model name (HF id)
+            model_path: model location inside the container (PVC or local path).
+                When omitted, `model_name` is used, i.e. workers download from HF.
+        """
+        cfg = Config.model_validate(config)
+        effective_path = model_name if model_path is None else model_path
+        # Frontend requires a real filesystem path (validate_model_path checks isdir),
+        # so its model args are only injected when the value looks like a path.
+        looks_like_path = isinstance(effective_path, str) and effective_path.startswith(
+            ("/", ".")
+        )
+        cls._apply_model_update_to_cfg(
+            cfg,
+            model_name=model_name,
+            model_path=effective_path,
+            patch_frontend=looks_like_path,
+        )
+        return cfg.model_dump()
+    @classmethod
+    def update_model_from_pvc(
+        cls,
+        config: dict,
+        model_name: str,
+        pvc_name: str,
+        pvc_mount_path: str,
+        pvc_path: str,
+    ) -> dict:
+        """
+        Update a DGD config to serve `model_name`, with weights located in a mounted PVC.
+        Steps:
+        - Add the PVC to `spec.pvcs` (with `create: False`)
+        - Add `volumeMounts` for Frontend + prefill + decode (if present)
+        - Patch worker and Frontend CLI args through `_apply_model_update_to_cfg`
+        Args:
+            config: DGD config dict
+            model_name: served model name
+            pvc_name: name of the PVC holding the weights; if empty, `config` is
+                returned unchanged
+            pvc_mount_path: where the PVC is mounted inside the containers
+            pvc_path: location of the model inside the PVC
+        Returns:
+            The updated config as a dict.
+        """
+        if not pvc_name:
+            return config
+        cfg = Config.model_validate(config)
+        model_path = cls._normalize_model_path(pvc_mount_path, pvc_path)
+        cls._ensure_spec_pvc(cfg, pvc_name)
+        # Mount to Frontend + prefill + decode services if present.
+        services = cfg.spec.services
+        if "Frontend" in services:
+            cls._ensure_service_volume_mount(
+                services["Frontend"], pvc_name, pvc_mount_path
+            )
+        for sct in (SubComponentType.PREFILL, SubComponentType.DECODE):
+            try:
+                svc_name = get_service_name_by_type(cfg, cls.BACKEND, sct)
+            except Exception:
+                # Same best-effort policy as `_apply_model_update_to_cfg`: a
+                # sub-component that cannot be resolved is skipped rather than
+                # aborting the whole update.
+                continue
+            if svc_name in services:
+                cls._ensure_service_volume_mount(
+                    services[svc_name], pvc_name, pvc_mount_path
+                )
+        # Patch workers + frontend with PVC model path.
+        cls._apply_model_update_to_cfg(
+            cfg,
+            model_name=model_name,
+            model_path=model_path,
+            patch_frontend=True,
+        )
+        return cfg.model_dump()
--- a/benchmarks/profiler/utils/config_modifiers/sglang.py
+++ b/benchmarks/profiler/utils/config_modifiers/sglang.py
@@ -18,6 +18,7 @@ from benchmarks.profiler.utils.config import (
    update_image,
    validate_and_get_worker_args,
 )
+from benchmarks.profiler.utils.config_modifiers.protocol import BaseConfigModifier
 from benchmarks.profiler.utils.defaults import (
    DEFAULT_MODEL_NAME,
    DYNAMO_RUN_DEFAULT_PORT,
@@ -39,40 +40,14 @@ logger.addHandler(console_handler)
 DEFAULT_SGLANG_CONFIG_PATH = "examples/backends/sglang/deploy/disagg.yaml"
-class SGLangConfigModifier:
+class SGLangConfigModifier(BaseConfigModifier):
+    BACKEND = "sglang"
    @classmethod
    def load_default_config(cls) -> dict:
        with open(DEFAULT_SGLANG_CONFIG_PATH, "r") as f:
            return yaml.safe_load(f)
-    @classmethod
-    def update_model(cls, config, model_name: str) -> dict:
-        # change the model to serve
-        cfg = Config.model_validate(config)
-        # Update model for both prefill and decode workers
-        for sub_component_type in [SubComponentType.PREFILL, SubComponentType.DECODE]:
-            try:
-                worker_service = get_worker_service_from_config(
-                    cfg, backend="sglang", sub_component_type=sub_component_type
-                )
-                args = validate_and_get_worker_args(worker_service, backend="sglang")
-                args = break_arguments(args)
-                # Update both --model-path and --served-model-name
-                args = set_argument_value(args, "--model-path", model_name)
-                args = set_argument_value(args, "--served-model-name", model_name)
-                worker_service.extraPodSpec.mainContainer.args = args
-            except (ValueError, KeyError):
-                # Service might not exist (e.g., in aggregated mode)
-                logger.debug(
-                    f"Skipping {sub_component_type} service as it doesn't exist"
-                )
-                continue
-        return cfg.model_dump()
    @classmethod
    def update_image(cls, config, image: str) -> dict:
        """Update container image for all DGD services (frontend, planner, workers)."""
@@ -292,7 +267,8 @@ class SGLangConfigModifier:
        args = remove_valued_arguments(args, "--data-parallel-size")
        # 3. Enable --enable-dp-attention
-        args = append_argument(args, "--enable-dp-attention")
+        if "--enable-dp-attention" not in args:
+            args = append_argument(args, "--enable-dp-attention")
        # 4. Set --ep=dep_size (expert parallelism size)
        args = set_argument_value(args, "--ep", str(dep_size))

--- a/benchmarks/profiler/utils/config_modifiers/trtllm.py
+++ b/benchmarks/profiler/utils/config_modifiers/trtllm.py
@@ -15,11 +15,11 @@ from benchmarks.profiler.utils.config import (
    get_worker_service_from_config,
    parse_override_engine_args,
    remove_valued_arguments,
-    set_argument_value,
    setup_worker_service_resources,
    update_image,
    validate_and_get_worker_args,
 )
+from benchmarks.profiler.utils.config_modifiers.protocol import BaseConfigModifier
 from benchmarks.profiler.utils.defaults import (
    DEFAULT_MODEL_NAME,
    DYNAMO_RUN_DEFAULT_PORT,
@@ -41,40 +41,14 @@ logger.addHandler(console_handler)
 DEFAULT_TRTLLM_CONFIG_PATH = "examples/backends/trtllm/deploy/disagg.yaml"
-class TrtllmConfigModifier:
+class TrtllmConfigModifier(BaseConfigModifier):
+    BACKEND = "trtllm"
    @classmethod
    def load_default_config(cls) -> dict:
        with open(DEFAULT_TRTLLM_CONFIG_PATH, "r") as f:
            return yaml.safe_load(f)
-    @classmethod
-    def update_model(cls, config, model_name: str) -> dict:
-        # change the model to serve
-        cfg = Config.model_validate(config)
-        # Update model for both prefill and decode workers
-        for sub_component_type in [SubComponentType.PREFILL, SubComponentType.DECODE]:
-            try:
-                worker_service = get_worker_service_from_config(
-                    cfg, backend="trtllm", sub_component_type=sub_component_type
-                )
-                args = validate_and_get_worker_args(worker_service, backend="trtllm")
-                args = break_arguments(args)
-                # Update both --model-path and --served-model-name
-                args = set_argument_value(args, "--model-path", model_name)
-                args = set_argument_value(args, "--served-model-name", model_name)
-                worker_service.extraPodSpec.mainContainer.args = args
-            except (ValueError, KeyError):
-                # Service might not exist (e.g., in aggregated mode)
-                logger.debug(
-                    f"Skipping {sub_component_type} service as it doesn't exist"
-                )
-                continue
-        return cfg.model_dump()
    @classmethod
    def update_image(cls, config, image: str) -> dict:
        """Update container image for all DGD services (frontend, planner, workers)."""

--- a/benchmarks/profiler/utils/config_modifiers/vllm.py
+++ b/benchmarks/profiler/utils/config_modifiers/vllm.py
@@ -16,6 +16,7 @@ from benchmarks.profiler.utils.config import (
    update_image,
    validate_and_get_worker_args,
 )
+from benchmarks.profiler.utils.config_modifiers.protocol import BaseConfigModifier
 from benchmarks.profiler.utils.defaults import (
    DEFAULT_MODEL_NAME,
    DYNAMO_RUN_DEFAULT_PORT,
@@ -37,39 +38,16 @@ logger.addHandler(console_handler)
 DEFAULT_VLLM_CONFIG_PATH = "examples/backends/vllm/deploy/disagg.yaml"
-class VllmV1ConfigModifier:
+class VllmV1ConfigModifier(BaseConfigModifier):
+    BACKEND = "vllm"
+    # vllm uses a different arg for model path
+    WORKER_MODEL_PATH_ARG = "--model"
    @classmethod
    def load_default_config(cls) -> dict:
        with open(DEFAULT_VLLM_CONFIG_PATH, "r") as f:
            return yaml.safe_load(f)
-    @classmethod
-    def update_model(cls, config, model_name: str) -> dict:
-        # change the model to serve
-        cfg = Config.model_validate(config)
-        # Update model for both prefill and decode workers
-        for sub_component_type in [SubComponentType.PREFILL, SubComponentType.DECODE]:
-            try:
-                worker_service = get_worker_service_from_config(
-                    cfg, backend="vllm", sub_component_type=sub_component_type
-                )
-                args = validate_and_get_worker_args(worker_service, backend="vllm")
-                args = break_arguments(args)
-                # Update --model (vllm uses --model instead of --model-path and --served-model-name)
-                args = set_argument_value(args, "--model", model_name)
-                worker_service.extraPodSpec.mainContainer.args = args
-            except (ValueError, KeyError):
-                # Service might not exist (e.g., in aggregated mode)
-                logger.debug(
-                    f"Skipping {sub_component_type} service as it doesn't exist"
-                )
-                continue
-        return cfg.model_dump()
    @classmethod
    def update_image(cls, config, image: str) -> dict:
        """Update container image for all DGD services (frontend, planner, workers)."""

--- a/benchmarks/profiler/utils/profiler_argparse.py
+++ b/benchmarks/profiler/utils/profiler_argparse.py
@@ -66,7 +66,13 @@ def create_profiler_parser() -> argparse.Namespace:
        deployment:
            namespace: String (kubernetes namespace, default: dynamo-sla-profiler)
            service_name: String (service name, default: "")
-            model: String (model to serve, can be HF model name or local model path)
+            model: String (served model name)
+            model_cache_pvc_name: String (name of the PVC to mount the model cache,
+                if not provided, model must be HF name and will download from HF, default: "")
+            model_cache_pvc_path: String (path to the model cache in the PVC, default: "")
+            model_cache_pvc_mount_path: String (path to the model cache in the container,
+                note that the PVC must be mounted to the same path for the profiling job,
+                default: "/opt/model-cache")
        engine:
            backend: String (backend type, currently support [vllm, sglang, trtllm], default: vllm)
            config: String (path to the DynamoGraphDeployment config file, default: "")
@@ -122,7 +128,27 @@ def create_profiler_parser() -> argparse.Namespace:
        "--model",
        type=str,
        default=config.get("deployment", {}).get("model", ""),
-        help="Model to serve, can be HF model name or local model path",
+        help="Served model name",
+    )
+    parser.add_argument(
+        "--model-cache-pvc-name",
+        type=str,
+        default=config.get("deployment", {}).get("model_cache_pvc_name", ""),
+        help="Name of the PVC that contains the model weights. If not provided, args.model must be a HF model name and will download from HF",
+    )
+    parser.add_argument(
+        "--model-cache-pvc-path",
+        type=str,
+        default=config.get("deployment", {}).get("model_cache_pvc_path", ""),
+        help="Path to the model cache in the PVC",
+    )
+    parser.add_argument(
+        "--model-cache-pvc-mount-path",
+        type=str,
+        default=config.get("deployment", {}).get(
+            "model_cache_pvc_mount_path", "/opt/model-cache"
+        ),
+        help="Path to the model cache in the container, note that the PVC must be mounted to the same path for the profiling job",
    )
    parser.add_argument(
        "--dgd-image",

--- a/benchmarks/profiler/utils/search_space_autogen.py
+++ b/benchmarks/profiler/utils/search_space_autogen.py
@@ -44,7 +44,17 @@ def auto_generate_search_space(args: argparse.Namespace) -> None:
    if args.model:
        logger.info(f"Updating model in DGD config file to {args.model}")
-        config = config_modifier.update_model(config, args.model)
+        if args.model_cache_pvc_name:
+            config = config_modifier.update_model_from_pvc(
+                config,
+                args.model,
+                args.model_cache_pvc_name,
+                args.model_cache_pvc_mount_path,
+                args.model_cache_pvc_path,
+            )
+        else:
+            # Non-PVC: workers download from HF, so model_path == model_name
+            config = config_modifier.update_model(config, args.model, args.model)
        if args.dgd_image:
            logger.info(f"Updating DGD image to {args.dgd_image}")
            config = config_modifier.update_image(config, args.dgd_image)
@@ -58,11 +68,30 @@ def auto_generate_search_space(args: argparse.Namespace) -> None:
    # get model info and update args
    model_info: ModelInfo | None = None
-    if not args.model:
+    model_name_or_path = ""
+    if args.model:
+        # prioritize using model cache in PVC over downloading from HF
+        if args.model_cache_pvc_name:
+            # Keep consistent path normalization with config mutation logic
+            model_name_or_path = config_modifier._normalize_model_path(
+                args.model_cache_pvc_mount_path, args.model_cache_pvc_path
+            )
+        else:
+            model_name_or_path = args.model
+    else:
        # get the model name from config
        args.model = config_modifier.get_model_name(config)
-    logger.info(f"Getting model info for {args.model}...")
+        model_name_or_path = args.model
-    model_info = get_model_info(args.model)
+    logger.info(f"Getting model info for {args.model} at {model_name_or_path}...")
+    try:
+        model_info = get_model_info(model_name_or_path)
+    except Exception as e:
+        # Common in dry-run mode when the PVC isn't mounted locally.
+        logger.warning(
+            f"Failed to load model info from local path '{model_name_or_path}': {e}. "
+            f"Trying to download from HF for '{args.model}'."
+        )
+        model_info = get_model_info(args.model)
    num_experts_str = (
        f", num_experts={model_info.num_experts}"

--- a/tests/profiler/test_profile_sla_dryrun.py
+++ b/tests/profiler/test_profile_sla_dryrun.py
@@ -74,6 +74,9 @@ class TestProfileSLADryRun:
                self.num_gpus_per_node = 8
                self.deploy_after_profile = False
                self.pick_with_webui = False
+                self.model_cache_pvc_name = ""
+                self.model_cache_pvc_path = ""
+                self.model_cache_pvc_mount_path = "/opt/model-cache"
                # Provide minimal model_info to avoid HF queries
                self.model_info = ModelInfo(
                    model_size=16384.0,
@@ -118,6 +121,9 @@ class TestProfileSLADryRun:
                self.num_gpus_per_node = 8
                self.deploy_after_profile = False
                self.pick_with_webui = False
+                self.model_cache_pvc_name = ""
+                self.model_cache_pvc_path = ""
+                self.model_cache_pvc_mount_path = "/opt/model-cache"
                self.model_info = ModelInfo(
                    model_size=16384.0,
                    architecture="TestArchitecture",
@@ -183,6 +189,9 @@ class TestProfileSLADryRun:
                self.num_gpus_per_node = 8
                self.deploy_after_profile = False
                self.pick_with_webui = False
+                self.model_cache_pvc_name = ""
+                self.model_cache_pvc_path = ""
+                self.model_cache_pvc_mount_path = "/opt/model-cache"
                self.model_info = ModelInfo(
                    model_size=16384.0,
                    architecture="TestArchitecture",
@@ -237,6 +246,10 @@ class TestProfileSLADryRun:
                self.num_gpus_per_node = 8
                self.deploy_after_profile = False
                self.pick_with_webui = False
+                # Added in newer profiler versions; keep Args compatible with search_space_autogen
+                self.model_cache_pvc_name = ""
+                self.model_cache_pvc_path = ""
+                self.model_cache_pvc_mount_path = "/opt/model-cache"
                self.model_info = ModelInfo(
                    model_size=65536.0,
                    architecture="TestMoEArchitecture",
@@ -315,6 +328,9 @@ class TestProfileSLADryRun:
                self.deploy_after_profile = False
                self.pick_with_webui = False
                self.enable_gpu_discovery = True
+                self.model_cache_pvc_name = ""
+                self.model_cache_pvc_path = ""
+                self.model_cache_pvc_mount_path = "/opt/model-cache"
        return Args()
@@ -383,6 +399,9 @@ class TestProfileSLADryRun:
                self.deploy_after_profile = False
                self.pick_with_webui = False
                self.enable_gpu_discovery = True
+                self.model_cache_pvc_name = ""
+                self.model_cache_pvc_path = ""
+                self.model_cache_pvc_mount_path = "/opt/model-cache"
        return Args()
@@ -451,6 +470,9 @@ class TestProfileSLADryRun:
                self.deploy_after_profile = False
                self.pick_with_webui = False
                self.enable_gpu_discovery = True
+                self.model_cache_pvc_name = ""
+                self.model_cache_pvc_path = ""
+                self.model_cache_pvc_mount_path = "/opt/model-cache"
        return Args()