Unverified commit a3cf35c3, authored by Hongkuan Zhou and committed by GitHub
Browse files

fix: disagg_planner.yaml using new planner CLI (#6760)


Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
parent 2cab0f7f
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# ──────────────────────────────────────────────────────────────────────────────
# Planner profiling data ConfigMap
#
# The planner requires pre-deployment profiling data (prefill & decode
# interpolation curves) for throughput-based scaling. The values below are
# PLACEHOLDERS for Qwen/Qwen3-0.6B — replace them with real measurements
# before deploying to production.
#
# How to obtain real data:
# • Run the Dynamo Profiler against your target GPU/model combination.
# See docs/components/profiler/profiler-guide.md for instructions.
# • Or run manual benchmarks and fill in the arrays below.
#
# Key format (JSON files mounted into the planner container):
# prefill_raw_data.json — prefill_isl (input sequence lengths),
# prefill_ttft (time-to-first-token in ms), prefill_thpt_per_gpu
# (tokens/s per GPU) — all 1-D arrays of equal length.
# decode_raw_data.json — x_kv_usage (KV-cache utilisation fractions),
# y_context_length (context lengths), z_itl (inter-token latency in ms,
# shape [len(x_kv_usage), len(y_context_length)]), z_thpt_per_gpu
# (tokens/s per GPU, same shape), max_kv_tokens (scalar).
# ──────────────────────────────────────────────────────────────────────────────
apiVersion: v1
kind: ConfigMap
metadata:
  name: planner-profile-data
data:
  # Prefill curve: three 1-D arrays of equal length (placeholder values).
  prefill_raw_data.json: |
    {
      "prefill_isl": [128, 256, 512, 1024, 2048],
      "prefill_ttft": [12, 18, 30, 55, 105],
      "prefill_thpt_per_gpu": [9800, 8500, 6200, 3800, 2000]
    }
  # Decode surface: each z_* matrix has one row per x_kv_usage entry and one
  # column per y_context_length entry (placeholder values).
  decode_raw_data.json: |
    {
      "x_kv_usage": [0.1, 0.3, 0.5, 0.7, 0.9],
      "y_context_length": [128, 512, 1024, 2048],
      "z_itl": [
        [5, 6, 7, 9 ],
        [6, 7, 8, 10],
        [7, 8, 10, 12],
        [8, 10, 12, 15],
        [10, 12, 15, 20]
      ],
      "z_thpt_per_gpu": [
        [4500, 4000, 3500, 2800],
        [4200, 3700, 3200, 2500],
        [3800, 3300, 2800, 2200],
        [3400, 2900, 2400, 1800],
        [2800, 2400, 1900, 1400]
      ],
      "max_kv_tokens": 32768
    }
---
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
......@@ -20,16 +74,13 @@ spec:
extraPodSpec:
mainContainer:
image: my-registry/sglang-runtime:my-tag
workingDir: /workspace/components/src/dynamo/planner
command:
- python3
- -m
- planner_sla
- dynamo.planner
args:
- --environment=kubernetes
- --backend=sglang
- --adjustment-interval=60
- --profile-results-dir=/workspace/profiling_results
- --config
- '{"environment": "kubernetes", "backend": "sglang", "throughput_adjustment_interval": 60, "profile_results_dir": "/workspace/profiling_results"}'
volumeMounts:
- name: planner-profile-data
mountPath: /workspace/profiling_results
......@@ -37,8 +88,6 @@ spec:
volumes:
- name: planner-profile-data
configMap:
# Must be pre-created before deployment by the profiler
# See docs/planner/sla_planner_quickstart.md for more details
name: planner-profile-data
decode:
envFromSecret: hf-token-secret
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# ──────────────────────────────────────────────────────────────────────────────
# Planner profiling data ConfigMap
#
# The planner requires pre-deployment profiling data (prefill & decode
# interpolation curves) for throughput-based scaling. The values below are
# PLACEHOLDERS for Qwen/Qwen3-0.6B — replace them with real measurements
# before deploying to production.
#
# How to obtain real data:
# • Run the Dynamo Profiler against your target GPU/model combination.
# See docs/components/profiler/profiler-guide.md for instructions.
# • Or run manual benchmarks and fill in the arrays below.
#
# Key format (JSON files mounted into the planner container):
# prefill_raw_data.json — prefill_isl (input sequence lengths),
# prefill_ttft (time-to-first-token in ms), prefill_thpt_per_gpu
# (tokens/s per GPU) — all 1-D arrays of equal length.
# decode_raw_data.json — x_kv_usage (KV-cache utilisation fractions),
# y_context_length (context lengths), z_itl (inter-token latency in ms,
# shape [len(x_kv_usage), len(y_context_length)]), z_thpt_per_gpu
# (tokens/s per GPU, same shape), max_kv_tokens (scalar).
# ──────────────────────────────────────────────────────────────────────────────
apiVersion: v1
kind: ConfigMap
metadata:
  name: planner-profile-data
data:
  # Prefill curve: three 1-D arrays of equal length (placeholder values).
  prefill_raw_data.json: |
    {
      "prefill_isl": [128, 256, 512, 1024, 2048],
      "prefill_ttft": [12, 18, 30, 55, 105],
      "prefill_thpt_per_gpu": [9800, 8500, 6200, 3800, 2000]
    }
  # Decode surface: each z_* matrix has one row per x_kv_usage entry and one
  # column per y_context_length entry (placeholder values).
  decode_raw_data.json: |
    {
      "x_kv_usage": [0.1, 0.3, 0.5, 0.7, 0.9],
      "y_context_length": [128, 512, 1024, 2048],
      "z_itl": [
        [5, 6, 7, 9 ],
        [6, 7, 8, 10],
        [7, 8, 10, 12],
        [8, 10, 12, 15],
        [10, 12, 15, 20]
      ],
      "z_thpt_per_gpu": [
        [4500, 4000, 3500, 2800],
        [4200, 3700, 3200, 2500],
        [3800, 3300, 2800, 2200],
        [3400, 2900, 2400, 1800],
        [2800, 2400, 1900, 1400]
      ],
      "max_kv_tokens": 32768
    }
---
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
......@@ -37,7 +91,6 @@ spec:
extraPodSpec:
mainContainer:
image: my-registry/tensorrtllm-runtime:my-tag
workingDir: /workspace/components/src/dynamo/planner
ports:
- name: metrics
containerPort: 9085
......@@ -45,11 +98,9 @@ spec:
- python3
args:
- -m
- planner_sla
- --environment=kubernetes
- --backend=trtllm
- --adjustment-interval=60
- --profile-results-dir=/workspace/profiling_results
- dynamo.planner
- --config
- '{"environment": "kubernetes", "backend": "trtllm", "throughput_adjustment_interval": 60, "profile_results_dir": "/workspace/profiling_results"}'
volumeMounts:
- name: planner-profile-data
mountPath: /workspace/profiling_results
......@@ -57,8 +108,6 @@ spec:
volumes:
- name: planner-profile-data
configMap:
# Must be pre-created before deployment by the profiler
# See docs/planner/sla_planner_quickstart.md for more details
name: planner-profile-data
TRTLLMDecodeWorker:
envFromSecret: hf-token-secret
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# ──────────────────────────────────────────────────────────────────────────────
# Planner profiling data ConfigMap
#
# The planner requires pre-deployment profiling data (prefill & decode
# interpolation curves) for throughput-based scaling. The values below are
# PLACEHOLDERS for Qwen/Qwen3-0.6B — replace them with real measurements
# before deploying to production.
#
# How to obtain real data:
# • Run the Dynamo Profiler against your target GPU/model combination.
# See docs/components/profiler/profiler-guide.md for instructions.
# • Or run manual benchmarks and fill in the arrays below.
#
# Key format (JSON files mounted into the planner container):
# prefill_raw_data.json — prefill_isl (input sequence lengths),
# prefill_ttft (time-to-first-token in ms), prefill_thpt_per_gpu
# (tokens/s per GPU) — all 1-D arrays of equal length.
# decode_raw_data.json — x_kv_usage (KV-cache utilisation fractions),
# y_context_length (context lengths), z_itl (inter-token latency in ms,
# shape [len(x_kv_usage), len(y_context_length)]), z_thpt_per_gpu
# (tokens/s per GPU, same shape), max_kv_tokens (scalar).
# ──────────────────────────────────────────────────────────────────────────────
apiVersion: v1
kind: ConfigMap
metadata:
  name: planner-profile-data
data:
  # Prefill curve: three 1-D arrays of equal length (placeholder values).
  prefill_raw_data.json: |
    {
      "prefill_isl": [128, 256, 512, 1024, 2048],
      "prefill_ttft": [12, 18, 30, 55, 105],
      "prefill_thpt_per_gpu": [9800, 8500, 6200, 3800, 2000]
    }
  # Decode surface: each z_* matrix has one row per x_kv_usage entry and one
  # column per y_context_length entry (placeholder values).
  decode_raw_data.json: |
    {
      "x_kv_usage": [0.1, 0.3, 0.5, 0.7, 0.9],
      "y_context_length": [128, 512, 1024, 2048],
      "z_itl": [
        [5, 6, 7, 9 ],
        [6, 7, 8, 10],
        [7, 8, 10, 12],
        [8, 10, 12, 15],
        [10, 12, 15, 20]
      ],
      "z_thpt_per_gpu": [
        [4500, 4000, 3500, 2800],
        [4200, 3700, 3200, 2500],
        [3800, 3300, 2800, 2200],
        [3400, 2900, 2400, 1800],
        [2800, 2400, 1900, 1400]
      ],
      "max_kv_tokens": 32768
    }
---
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
......@@ -19,16 +73,13 @@ spec:
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/components/src/dynamo/planner
command:
- python3
- -m
- planner_sla
- dynamo.planner
args:
- --environment=kubernetes
- --backend=vllm
- --adjustment-interval=60
- --profile-results-dir=/workspace/profiling_results
- --config
- '{"environment": "kubernetes", "backend": "vllm", "throughput_adjustment_interval": 60, "profile_results_dir": "/workspace/profiling_results"}'
volumeMounts:
- name: planner-profile-data
mountPath: /workspace/profiling_results
......@@ -36,8 +87,6 @@ spec:
volumes:
- name: planner-profile-data
configMap:
# Must be pre-created before deployment by the profiler
# See docs/planner/sla_planner_quickstart.md for more details
name: planner-profile-data
VllmDecodeWorker:
envFromSecret: hf-token-secret
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment