Unverified Commit a20b9dde authored by Alec's avatar Alec Committed by GitHub
Browse files

feat(container): add standalone dynamo-planner image [DYN-2533] (#7696)

parent c4ef45bb
......@@ -116,6 +116,7 @@ deploy:
- 'tests/deploy/**'
planner:
- 'container/templates/planner.Dockerfile'
- 'components/src/dynamo/planner/**'
- 'components/src/dynamo/global_planner/**'
- 'tests/planner/**'
......
......@@ -84,13 +84,13 @@ def _parse_config() -> PlannerConfig:
@dynamo_worker()
async def worker(runtime: DistributedRuntime):
config = _parse_config()
async def worker(runtime: DistributedRuntime, config: PlannerConfig):
await init_planner(runtime, config)
def main():
asyncio.run(worker()) # type: ignore[call-arg]
config = _parse_config()
asyncio.run(worker(config)) # type: ignore[call-arg]
if __name__ == "__main__":
......
......@@ -102,7 +102,7 @@ class DgdPlannerServiceConfig(BaseModel):
replicas: int = 1
extraPodSpec: PodSpec = PodSpec(
mainContainer=Container(
image="my-registry/dynamo-runtime:my-tag", # placeholder
image="my-registry/dynamo-planner:my-tag", # placeholder
workingDir=f"{get_workspace_dir()}/components/src/dynamo/planner",
command=["python3", "-m", "dynamo.planner"],
args=[],
......@@ -397,7 +397,7 @@ def set_argument_value(args: list[str], arg_name: str, value: str) -> list[str]:
def update_image(config: dict, image: str) -> dict:
"""Update container image for all DGD services (frontend, planner, workers).
"""Update container image for non-planner DGD services.
This is a shared utility function used by all backend config modifiers.
......@@ -410,8 +410,9 @@ def update_image(config: dict, image: str) -> dict:
"""
cfg = Config.model_validate(config)
# Update image for all services
for service_name, service_config in cfg.spec.services.items():
if getattr(service_config, "componentType", None) == "planner":
continue
if service_config.extraPodSpec and service_config.extraPodSpec.mainContainer:
service_config.extraPodSpec.mainContainer.image = image
logger.debug(f"Updated image for {service_name} to {image}")
......
......@@ -28,6 +28,7 @@ from dynamo.planner.config.planner_config import PlannerConfig
from dynamo.profiler.utils.config import DgdPlannerServiceConfig, set_argument_value
from dynamo.profiler.utils.profile_common import (
ProfilerOperationalConfig,
derive_planner_image,
is_mocker_enabled,
is_planner_enabled,
needs_profile_data,
......@@ -180,8 +181,10 @@ def add_planner_to_config(
planner_cfg.profile_results_dir = PROFILE_DATA_MOUNT
planner_service = DgdPlannerServiceConfig()
if planner_service.extraPodSpec.mainContainer:
planner_service.extraPodSpec.mainContainer.image = dgdr.image
if planner_service.extraPodSpec.mainContainer and dgdr.image:
planner_service.extraPodSpec.mainContainer.image = derive_planner_image(
dgdr.image
)
planner_dict = planner_service.model_dump(exclude_unset=False)
......
......@@ -45,6 +45,29 @@ BACKEND_IMAGE_NAMES: dict[str, str] = {
"trtllm": "tensorrtllm-runtime",
}
PLANNER_IMAGE_NAME = "dynamo-planner"


def _replace_image_name(image_ref: str, new_name: str) -> str:
    """Swap the image name inside a Docker image reference.

    Only the final ``/``-delimited component is rewritten. The leading
    registry/repository path, a trailing ``:tag`` and a trailing ``@digest``
    are carried over unchanged.

    Args:
        image_ref: Reference such as ``registry/path/name:tag@sha256:...``.
        new_name: Image name substituted for the last path component.

    Returns:
        ``image_ref`` with its image name replaced by ``new_name``.
    """
    # Split on the last "/" so anything before it (including a "host:port"
    # registry) stays in the prefix and is never scanned for a tag colon.
    repo_path, slash, last_component = image_ref.rpartition("/")
    # Peel the digest off before looking for the tag: the digest carries its
    # own ":" (e.g. "sha256:...") that must not be mistaken for a tag.
    tagged_name, at_sign, digest = last_component.partition("@")
    _, colon, tag_value = tagged_name.rpartition(":")

    pieces = [repo_path, slash, new_name]
    if colon:
        pieces += [colon, tag_value]
    if at_sign:
        pieces += [at_sign, digest]
    return "".join(pieces)


def derive_planner_image(profiler_image: str) -> str:
    """Return the planner image reference that corresponds to ``profiler_image``.

    The registry path, tag and digest of ``profiler_image`` are kept; only the
    image name is replaced with ``PLANNER_IMAGE_NAME``.
    """
    return _replace_image_name(
        image_ref=profiler_image,
        new_name=PLANNER_IMAGE_NAME,
    )
def derive_backend_image(profiler_image: str, backend: str) -> str:
"""Derive the backend worker image from the profiler image.
......@@ -82,14 +105,7 @@ def derive_backend_image(profiler_image: str, backend: str) -> str:
f"Supported backends: {list(BACKEND_IMAGE_NAMES.keys())}"
)
# Split off the last path component: "registry/path/name:tag" → "name:tag"
slash_idx = profiler_image.rfind("/")
prefix = profiler_image[: slash_idx + 1] if slash_idx >= 0 else ""
suffix = profiler_image[slash_idx + 1 :]
colon_idx = suffix.find(":")
tag = suffix[colon_idx:] if colon_idx >= 0 else ""
return f"{prefix}{backend_image_name}{tag}"
return _replace_image_name(profiler_image, backend_image_name)
# ---------------------------------------------------------------------------
......
......@@ -13,6 +13,10 @@
{% include "templates/dynamo_base.Dockerfile" %}
{% include "templates/wheel_builder.Dockerfile" %}
{% include "templates/frontend.Dockerfile" %}
{% elif target == "planner" %}
{% include "templates/dynamo_base.Dockerfile" %}
{% include "templates/wheel_builder.Dockerfile" %}
{% include "templates/planner.Dockerfile" %}
{% elif target == "runtime" or target == "dev" or target == "local-dev" %}
{% include "templates/dynamo_base.Dockerfile" %}
{% include "templates/wheel_builder.Dockerfile" %}
......
......@@ -18,6 +18,10 @@ dynamo:
base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04
epp_image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:v0.5.1
frontend_image: nvcr.io/nvidia/base/ubuntu:noble-20250619
planner_build_image: python
planner_build_image_tag: 3.12-slim
planner_runtime_image: nvcr.io/nvidia/distroless/python
planner_runtime_image_tag: 3.12-v4.0.3
python_version: "3.12"
nats_version: v2.10.28
......
......@@ -8,7 +8,7 @@ so each image only installs what it needs.
| File | Purpose |
|------|---------|
| `requirements.common.txt` | Core deps shared by all containers |
| `requirements.planner.txt` | Planner, profiler, global_planner, deploy utils |
| `requirements.planner.txt` | Planner, profiler, and global_planner deps |
| `requirements.frontend.txt` | Frontend deps |
| `requirements.vllm.txt` | vLLM-specific deps |
| `requirements.benchmark.txt` | Benchmark and profiling tools |
......
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Dependencies for planner, profiler, global_planner, and deploy utils.
# Dependencies required by the planner, profiler, and global_planner services.
aiconfigurator[webapp] @ git+https://github.com/ai-dynamo/aiconfigurator.git@5d419f99d60fdae0d3911cba06a9b571f3b2965c
aiofiles<=25.1.0
......
......@@ -140,6 +140,7 @@ def validate_args(args):
"dev",
"local-dev",
"frontend",
"planner",
"wheel_builder",
"base",
],
......
......@@ -81,6 +81,13 @@ ARG EPP_IMAGE={{ context.dynamo.epp_image }}
ARG FRONTEND_IMAGE={{ context.dynamo.frontend_image }}
{% endif %}
{% if target == "planner" %}
ARG PLANNER_BUILD_IMAGE={{ context.dynamo.planner_build_image }}
ARG PLANNER_BUILD_IMAGE_TAG={{ context.dynamo.planner_build_image_tag }}
ARG PLANNER_RUNTIME_IMAGE={{ context.dynamo.planner_runtime_image }}
ARG PLANNER_RUNTIME_IMAGE_TAG={{ context.dynamo.planner_runtime_image_tag }}
{% endif %}
{% if framework == "vllm" -%}
# Make sure to update the dependency version in pyproject.toml when updating this
ARG VLLM_REF={{ context[framework][device_key].vllm_ref }}
......
{#
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#}
# === BEGIN templates/planner.Dockerfile ===
##############################################
########## Planner / Profiler image ##########
##############################################
# Standalone planner/profiler image:
# - install deps in a slim builder stage that has git/git-lfs available
# - ship only the runtime artifacts in a distroless final stage
# Builder stage: resolves and installs all Python dependencies into a venv and
# stages the repository subset that the final stage copies out.
FROM ${PLANNER_BUILD_IMAGE}:${PLANNER_BUILD_IMAGE_TAG} AS planner_builder
# Declared without a default in this stage; consumed by `uv venv --python` below.
ARG PYTHON_VERSION
# Install only the packages needed to resolve and install the planner runtime
# dependencies in the builder stage. git/git-lfs are only needed because
# aiconfigurator is currently installed from a Git URL with LFS-backed assets.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
apt-get update -y && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
ca-certificates \
git \
git-lfs \
libgomp1 && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Create dynamo user with group 0 for OpenShift compatibility.
# The `id -u` check fails the build unless the new user received UID 1000,
# which the uid=1000 cache mounts below and the --chown=1000:0 copies in the
# final stage rely on.
RUN useradd -m -s /bin/bash -g 0 dynamo \
&& [ `id -u dynamo` -eq 1000 ] \
&& mkdir -p /home/dynamo/.cache /opt/dynamo /workspace \
&& chown -R dynamo:0 /home/dynamo /opt/dynamo /workspace \
&& chmod -R g+w /home/dynamo/.cache /opt/dynamo /workspace
# PATH lists the venv's bin directory first; PYTHONPATH points at the
# /workspace directory that receives the repository subset copied below.
ENV HOME=/home/dynamo \
VIRTUAL_ENV=/opt/dynamo/venv \
PATH="/opt/dynamo/venv/bin:/usr/local/bin/etcd:/usr/local/bin:/bin" \
PYTHONPATH="/workspace"
WORKDIR /workspace
# uv/uvx binaries are taken from the upstream uv image.
# NOTE(review): the `:latest` tag is unpinned — consider pinning a version for
# reproducible builds.
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
# nats-server and etcd come from the dynamo_base stage; the locally built
# wheels come from the wheel_builder stage.
COPY --from=dynamo_base /usr/local/bin/nats-server /usr/local/bin/nats-server
COPY --from=dynamo_base /usr/local/bin/etcd /usr/local/bin/etcd
COPY --chown=dynamo:0 --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/
# Everything from here on runs as the unprivileged dynamo user.
USER dynamo
# Create the virtualenv at ${VIRTUAL_ENV} with the requested Python version.
RUN --mount=type=cache,target=/home/dynamo/.cache/uv,uid=1000,gid=0,mode=0775 \
export UV_CACHE_DIR=/home/dynamo/.cache/uv && \
uv venv ${VIRTUAL_ENV} --python ${PYTHON_VERSION}
# Install the local wheels and planner/profiler runtime dependencies before the
# repo copies so changes in tests/configs don't invalidate the dependency layer.
# UV_GIT_LFS=1 is set for this install; see the git-lfs note on the apt step.
RUN --mount=type=bind,source=./container/deps/requirements.planner.txt,target=/tmp/requirements.planner.txt \
--mount=type=cache,target=/home/dynamo/.cache/uv,uid=1000,gid=0,mode=0775 \
export UV_CACHE_DIR=/home/dynamo/.cache/uv UV_GIT_LFS=1 UV_HTTP_TIMEOUT=300 UV_HTTP_RETRIES=5 && \
uv pip install \
--requirement /tmp/requirements.planner.txt \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl
# Copy only the subset of the repository needed for planner/profiler service
# startup and targeted planner/profiler unit tests.
COPY --chmod=664 --chown=dynamo:0 pyproject.toml /workspace/pyproject.toml
COPY --chmod=775 --chown=dynamo:0 tests /workspace/tests
COPY --chmod=775 --chown=dynamo:0 components/src/dynamo/planner /workspace/components/src/dynamo/planner
COPY --chmod=775 --chown=dynamo:0 components/src/dynamo/profiler /workspace/components/src/dynamo/profiler
COPY --chmod=775 --chown=dynamo:0 components/src/dynamo/global_planner /workspace/components/src/dynamo/global_planner
COPY --chmod=775 --chown=dynamo:0 deploy /workspace/deploy
COPY --chmod=775 --chown=dynamo:0 examples /workspace/examples
# Final stage: starts from the runtime base image and receives only artifacts
# copied out of the planner_builder stage.
FROM ${PLANNER_RUNTIME_IMAGE}:${PLANNER_RUNTIME_IMAGE_TAG} AS planner
# Carry over the user/group databases so `USER dynamo` below resolves to the
# account created in planner_builder.
COPY --from=planner_builder /etc/group /etc/passwd /etc/
# Install the builder's dash binary as /usr/bin/sh.
# NOTE(review): presumably the runtime base image ships no shell — confirm.
COPY --from=planner_builder /bin/dash /usr/bin/sh
COPY --from=planner_builder /bin/uv /bin/uvx /usr/local/bin/
# Home directory and virtualenv are owned by UID 1000 / GID 0, the IDs the
# builder stage enforces for the dynamo user.
COPY --chown=1000:0 --from=planner_builder /home/dynamo /home/dynamo
COPY --chown=1000:0 --from=planner_builder /opt/dynamo/venv /opt/dynamo/venv
# libgomp is placed under /opt/dynamo/lib, which LD_LIBRARY_PATH below points at.
COPY --from=planner_builder /usr/lib/*-linux-gnu/libgomp.so.1* /opt/dynamo/lib/
COPY --from=planner_builder /usr/local/bin/etcd /usr/local/bin/etcd
COPY --from=planner_builder /usr/local/bin/nats-server /usr/local/bin/nats-server
COPY --chown=1000:0 --from=planner_builder /workspace /workspace
# Build-time commit identifier, exported into the image environment.
ARG DYNAMO_COMMIT_SHA
# HOME, VIRTUAL_ENV, PATH and PYTHONPATH repeat the builder-stage values so the
# copied venv and workspace are found at the same locations.
ENV DYNAMO_COMMIT_SHA=${DYNAMO_COMMIT_SHA} \
HOME=/home/dynamo \
VIRTUAL_ENV=/opt/dynamo/venv \
LD_LIBRARY_PATH="/opt/dynamo/lib" \
PATH="/opt/dynamo/venv/bin:/usr/local/bin/etcd:/usr/local/bin:/bin" \
PYTHONPATH="/workspace"
WORKDIR /workspace
USER dynamo
# Empty default command.
# NOTE(review): presumably the deployment supplies the command to run — confirm.
CMD []
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import sys
from pathlib import Path
import pytest
# Take the third parent of this file's path as the repository root and put it,
# followed by <root>/components/src, at the front of sys.path.
# NOTE(review): presumably this lets the `dynamo.*` imports below resolve from
# the source tree without an installed package — confirm.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
sys.path.insert(0, str(project_root / "components" / "src"))
# Markers applied to every test in this module.
pytestmark = [pytest.mark.pre_merge, pytest.mark.gpu_0, pytest.mark.unit]
try:
from dynamo.profiler.utils.config import update_image
from dynamo.profiler.utils.dgd_generation import add_planner_to_config
from dynamo.profiler.utils.dgdr_v1beta1_types import (
DynamoGraphDeploymentRequestSpec,
HardwareSpec,
SLASpec,
WorkloadSpec,
)
from dynamo.profiler.utils.profile_common import (
derive_backend_image,
derive_planner_image,
)
except ImportError as e:
pytest.skip(f"Skip (missing dependency): {e}", allow_module_level=True)
def _make_dgdr(image: str) -> DynamoGraphDeploymentRequestSpec:
    """Build a fixed TRT-LLM DGDR spec whose ``image`` field is ``image``."""
    hardware = HardwareSpec(gpuSku="h200_sxm", totalGpus=8, numGpusPerNode=8)
    workload = WorkloadSpec(isl=4000, osl=1000)
    sla = SLASpec(ttft=2000.0, itl=50.0)
    return DynamoGraphDeploymentRequestSpec(
        model="Qwen/Qwen3-32B",
        backend="trtllm",
        image=image,
        hardware=hardware,
        workload=workload,
        sla=sla,
    )
def _base_dgd_config(image: str) -> dict:
    """Return a minimal DGD config dict containing a single ``Frontend`` service.

    The service's main container uses ``image`` and the args ``["serve"]``.
    A fresh dict is built on every call.
    """
    main_container = {"image": image, "args": ["serve"]}
    frontend_service = {
        "replicas": 1,
        "extraPodSpec": {"mainContainer": main_container},
    }
    return {
        "metadata": {"name": "test-dgd"},
        "spec": {"services": {"Frontend": frontend_service}},
    }
@pytest.mark.parametrize(
    "image,expected",
    [
        # tag only
        (
            "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:1.2.3",
            "nvcr.io/nvidia/ai-dynamo/dynamo-planner:1.2.3",
        ),
        # digest only
        (
            "nvcr.io/nvidia/ai-dynamo/dynamo-frontend@sha256:deadbeef",
            "nvcr.io/nvidia/ai-dynamo/dynamo-planner@sha256:deadbeef",
        ),
        # tag and digest together
        (
            "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:1.2.3@sha256:deadbeef",
            "nvcr.io/nvidia/ai-dynamo/dynamo-planner:1.2.3@sha256:deadbeef",
        ),
    ],
)
def test_derive_planner_image_preserves_registry_tag_and_digest(
    image: str, expected: str
):
    """Only the image name changes; registry path, tag and digest are kept."""
    derived = derive_planner_image(image)
    assert derived == expected
@pytest.mark.parametrize(
    "image,backend,expected",
    [
        # tag only
        (
            "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:1.2.3",
            "vllm",
            "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.2.3",
        ),
        # digest only
        (
            "nvcr.io/nvidia/ai-dynamo/dynamo-frontend@sha256:deadbeef",
            "sglang",
            "nvcr.io/nvidia/ai-dynamo/sglang-runtime@sha256:deadbeef",
        ),
        # tag and digest together
        (
            "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:1.2.3@sha256:deadbeef",
            "trtllm",
            "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.2.3@sha256:deadbeef",
        ),
    ],
)
def test_derive_backend_image_preserves_registry_tag_and_digest(
    image: str, backend: str, expected: str
):
    """The backend image name is substituted; registry, tag and digest are kept."""
    derived = derive_backend_image(image, backend)
    assert derived == expected
def test_add_planner_to_config_uses_dynamo_planner_image():
    """The Planner service added to the config runs the dynamo-planner image."""
    profiler_image = "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:1.2.3"
    request = _make_dgdr(profiler_image)
    dgd_config = _base_dgd_config(profiler_image)

    add_planner_to_config(request, dgd_config)

    planner_service = dgd_config["spec"]["services"]["Planner"]
    main_container = planner_service["extraPodSpec"]["mainContainer"]
    assert main_container["image"] == "nvcr.io/nvidia/ai-dynamo/dynamo-planner:1.2.3"
def test_update_image_does_not_overwrite_planner_service_image():
    """``update_image`` retargets the Frontend but leaves the Planner image alone."""
    profiler_image = "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:1.2.3"
    worker_image = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.2.3"
    request = _make_dgdr(profiler_image)
    dgd_config = _base_dgd_config(profiler_image)
    add_planner_to_config(request, dgd_config)

    services = update_image(dgd_config, worker_image)["spec"]["services"]

    # Non-planner service picks up the new worker image.
    frontend_container = services["Frontend"]["extraPodSpec"]["mainContainer"]
    assert frontend_container["image"] == worker_image
    # Planner keeps the image derived from the profiler image.
    planner_container = services["Planner"]["extraPodSpec"]["mainContainer"]
    assert planner_container["image"] == "nvcr.io/nvidia/ai-dynamo/dynamo-planner:1.2.3"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment