Unverified commit e8ecf6ff, authored by Hongkuan Zhou and committed by GitHub
Browse files

fix(planner): use dynamo-planner image for profiler job and planner pods...


fix(planner): use dynamo-planner image for profiler job and planner pods [DYN-2733][DYN-2746] (#8407)
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
parent f923777e
......@@ -1253,6 +1253,13 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
// Use image from spec; the defaulting webhook fills this in for production builds.
// Guard against empty image in case the webhook didn't run (e.g. local dev builds).
//
// Starting with Dynamo 1.1.0, the profiler's runtime dependencies
// (kubernetes_asyncio, pmdarima, prophet, aiconfigurator, ...) live in the
// dedicated dynamo-planner image, not in backend runtime or frontend images.
// Users on 1.1.0+ must set spec.image to a planner image
// (e.g. nvcr.io/nvidia/ai-dynamo/dynamo-planner:<version>); earlier versions
// can continue using the frontend/backend image they were using before.
imageName := dgdr.Spec.Image
if imageName == "" {
return nil, false, fmt.Errorf("spec.image is required but not set; ensure the defaulting webhook ran or set spec.image explicitly")
......
......@@ -35,7 +35,13 @@ const (
// defaultImage is the default profiler image used when spec.image is not set.
// Default image derivation is only supported for public release versions (1.0.0+).
defaultImage = "nvcr.io/nvidia/ai-dynamo/dynamo-frontend"
//
// Starting with Dynamo 1.1.0, the profiler's runtime dependencies
// (kubernetes_asyncio, pmdarima, prophet, aiconfigurator, ...) ship only in the
// dedicated dynamo-planner image, so we default to that image here. Users who
// pin an earlier version may continue to override spec.image explicitly with
// the frontend image they were using before.
defaultImage = "nvcr.io/nvidia/ai-dynamo/dynamo-planner"
)
// DGDRDefaulter is a mutating webhook handler that fills in default values for
......@@ -43,7 +49,7 @@ const (
//
// If spec.image is not set, it is derived as:
//
// nvcr.io/nvidia/ai-dynamo/dynamo-frontend:<operatorVersion>
// nvcr.io/nvidia/ai-dynamo/dynamo-planner:<operatorVersion>
//
// Defaulting requires a known operator version and is only supported for
// operator versions 1.0.0 and later.
......
......@@ -28,6 +28,8 @@ import (
)
func TestDGDRDefaulter_defaultImageFor(t *testing.T) {
// Note: the default planner image is only published starting from Dynamo 1.1.0,
// so these tests use 1.1.0 as the earliest known valid version.
tests := []struct {
name string
operatorVersion string
......@@ -35,13 +37,13 @@ func TestDGDRDefaulter_defaultImageFor(t *testing.T) {
}{
{
name: "known version produces default image",
operatorVersion: "1.0.0",
expectedImage: "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:1.0.0",
operatorVersion: "1.1.0",
expectedImage: "nvcr.io/nvidia/ai-dynamo/dynamo-planner:1.1.0",
},
{
name: "pre-release version is valid",
operatorVersion: "1.0.0-rc1",
expectedImage: "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:1.0.0-rc1",
operatorVersion: "1.1.0-rc1",
expectedImage: "nvcr.io/nvidia/ai-dynamo/dynamo-planner:1.1.0-rc1",
},
{
name: "unknown operator version cannot be defaulted",
......@@ -85,14 +87,14 @@ func TestDGDRDefaulter_Default(t *testing.T) {
}{
{
name: "CREATE with empty image defaults to operator version",
version: "1.0.0",
version: "1.1.0",
operation: admissionv1.Create,
initialImage: "",
expectedImage: "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:1.0.0",
expectedImage: "nvcr.io/nvidia/ai-dynamo/dynamo-planner:1.1.0",
},
{
name: "CREATE with preset image is not overwritten",
version: "1.0.0",
version: "1.1.0",
operation: admissionv1.Create,
initialImage: "my-registry/my-image:custom",
expectedImage: "my-registry/my-image:custom",
......@@ -106,7 +108,7 @@ func TestDGDRDefaulter_Default(t *testing.T) {
},
{
name: "UPDATE does not default image",
version: "1.0.0",
version: "1.1.0",
operation: admissionv1.Update,
initialImage: "",
expectedImage: "",
......
......@@ -80,8 +80,12 @@ spec:
model: Qwen/Qwen3-0.6B
# Container image for the profiling job — must match your installed platform version.
# This is the same dynamo-frontend image used by the deployed inference service.
image: "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:${RELEASE_VERSION}"
# Dynamo >= 1.1.0: use the dedicated planner/profiler image (dynamo-planner).
# Planner/profiler runtime deps (kubernetes_asyncio, pmdarima, prophet,
# aiconfigurator, ...) ship only in this image; the frontend and backend
# runtime images do not.
# Dynamo < 1.1.0: use the dynamo-frontend image you deploy with.
image: "nvcr.io/nvidia/ai-dynamo/dynamo-planner:${RELEASE_VERSION}"
```
Apply it (uses `envsubst` to substitute the `RELEASE_VERSION` shell variable into the YAML):
......@@ -95,7 +99,7 @@ envsubst < qwen3-first-model.yaml | kubectl apply -f - -n ${NAMESPACE}
| Field | Required | Default | Purpose |
|---|---|---|---|
| `model` | Yes | — | HuggingFace model ID (e.g. `Qwen/Qwen3-0.6B`) |
| `image` | No | — | Container image for the profiling job (`dynamo-frontend`) |
| `image` | No | — | Container image for the profiling job. For Dynamo ≥ 1.1.0 this must be the `dynamo-planner` image; for earlier versions it is the `dynamo-frontend` image. |
| `backend` | No | `auto` | Inference engine (`auto`, `vllm`, `sglang`, `trtllm`) |
| `searchStrategy` | No | `rapid` | Profiling depth — `rapid` (~30s, AIC simulation) or `thorough` (2–4h, real GPUs) |
| `autoApply` | No | `true` | Automatically create and start the deployment after profiling |
......
......@@ -62,7 +62,14 @@ spec:
replicas: 1
extraPodSpec:
mainContainer:
image: my-registry/sglang-runtime:my-tag
# Planner image selection:
# Dynamo >= 1.1.0: use the dedicated planner image
# <registry>/dynamo-planner:<version>
# (backend runtime images no longer ship planner runtime deps
# such as kubernetes_asyncio, pmdarima, prophet, aiconfigurator).
# Dynamo < 1.1.0: use the backend runtime image
# <registry>/sglang-runtime:<version>.
image: my-registry/dynamo-planner:my-tag
command:
- python3
- -m
......
......@@ -79,7 +79,14 @@ spec:
replicas: 1
extraPodSpec:
mainContainer:
image: my-registry/tensorrtllm-runtime:my-tag
# Planner image selection:
# Dynamo >= 1.1.0: use the dedicated planner image
# <registry>/dynamo-planner:<version>
# (backend runtime images no longer ship planner runtime deps
# such as kubernetes_asyncio, pmdarima, prophet, aiconfigurator).
# Dynamo < 1.1.0: use the backend runtime image
# <registry>/tensorrtllm-runtime:<version>.
image: my-registry/dynamo-planner:my-tag
ports:
- name: metrics
containerPort: 9085
......
......@@ -61,7 +61,14 @@ spec:
replicas: 1
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
# Planner image selection:
# Dynamo >= 1.1.0: use the dedicated planner image
# nvcr.io/nvidia/ai-dynamo/dynamo-planner:<version>
# (backend runtime images no longer ship planner runtime deps
# such as kubernetes_asyncio, pmdarima, prophet, aiconfigurator).
# Dynamo < 1.1.0: use the backend runtime image
# nvcr.io/nvidia/ai-dynamo/vllm-runtime:<version>.
image: nvcr.io/nvidia/ai-dynamo/dynamo-planner:my-tag
command:
- python3
- -m
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment