# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# ──────────────────────────────────────────────────────────────────────────────
# Planner profiling data ConfigMap
#
# The planner requires pre-deployment profiling data (prefill & decode
# interpolation curves) for throughput-based scaling. The values below are
# PLACEHOLDERS for Qwen/Qwen3-0.6B — replace them with real measurements
# before deploying to production.
#
# How to obtain real data:
#   • Run the Dynamo Profiler against your target GPU/model combination.
#     See docs/components/profiler/profiler-guide.md for instructions.
#   • Or run manual benchmarks and fill in the arrays below.
#
# Key format (JSON files mounted into the planner container):
#   prefill_raw_data.json — prefill_isl (input sequence lengths),
#       prefill_ttft (time-to-first-token in ms), prefill_thpt_per_gpu
#       (tokens/s per GPU) — all 1-D arrays of equal length.
#   decode_raw_data.json — x_kv_usage (KV-cache utilisation fractions),
#       y_context_length (context lengths), z_itl (inter-token latency in ms,
#       shape [len(x_kv_usage), len(y_context_length)]), z_thpt_per_gpu
#       (tokens/s per GPU, same shape), max_kv_tokens (scalar).
# ──────────────────────────────────────────────────────────────────────────────
apiVersion: v1
kind: ConfigMap
metadata:
  name: planner-profile-data
data:
  # Placeholder prefill samples: three parallel arrays with five points each.
  prefill_raw_data.json: |
    {
      "prefill_isl": [128, 256, 512, 1024, 2048],
      "prefill_ttft": [12, 18, 30, 55, 105],
      "prefill_thpt_per_gpu": [9800, 8500, 6200, 3800, 2000]
    }
  # Placeholder decode samples: each z_* grid has one row per x_kv_usage entry
  # (5 rows) and one column per y_context_length entry (4 columns).
  decode_raw_data.json: |
    {
      "x_kv_usage": [0.1, 0.3, 0.5, 0.7, 0.9],
      "y_context_length": [128, 512, 1024, 2048],
      "z_itl": [
        [5,  6,  7,  9 ],
        [6,  7,  8,  10],
        [7,  8,  10, 12],
        [8,  10, 12, 15],
        [10, 12, 15, 20]
      ],
      "z_thpt_per_gpu": [
        [4500, 4000, 3500, 2800],
        [4200, 3700, 3200, 2500],
        [3800, 3300, 2800, 2200],
        [3400, 2900, 2400, 1800],
        [2800, 2400, 1900, 1400]
      ],
      "max_kv_tokens": 32768
    }
---
# Disaggregated SGLang deployment: one frontend, one planner, and separate
# decode / prefill worker pools (two replicas each, one GPU per replica).
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: sglang-disagg-planner
spec:
  services:
    Frontend:
      componentType: frontend
      replicas: 1
      extraPodSpec:
        mainContainer:
          image: my-registry/sglang-runtime:my-tag

    Planner:
      componentType: planner
      replicas: 1
      envFromSecret: hf-token-secret
      extraPodSpec:
        mainContainer:
          image: my-registry/sglang-runtime:my-tag
          command: ["python3", "-m", "dynamo.planner"]
          args:
            # Planner settings are passed as a single inline JSON document.
            - --config
            - '{"environment": "kubernetes", "backend": "sglang", "throughput_adjustment_interval": 60, "profile_results_dir": "/workspace/profiling_results"}'
          volumeMounts:
            # mountPath equals profile_results_dir in the --config JSON above,
            # so the ConfigMap's two JSON files appear in that directory.
            - name: planner-profile-data
              mountPath: /workspace/profiling_results
              readOnly: true
        volumes:
          # Backed by the planner-profile-data ConfigMap defined in the first
          # document of this file.
          - name: planner-profile-data
            configMap:
              name: planner-profile-data

    decode:
      componentType: worker
      subComponentType: decode
      replicas: 2
      envFromSecret: hf-token-secret
      resources:
        limits:
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: my-registry/sglang-runtime:my-tag
          workingDir: /workspace/examples/backends/sglang
          command: ["python3"]
          args:
            - -m
            - dynamo.sglang
            # Model to load and the name it is served under.
            - --model-path
            - Qwen/Qwen3-0.6B
            - --served-model-name
            - Qwen/Qwen3-0.6B
            - --page-size
            - "16"
            - --tp
            - "1"
            - --trust-remote-code
            - --skip-tokenizer-init
            # Disaggregation flags; only the mode value differs from the
            # prefill worker below.
            - --disaggregation-mode
            - decode
            - --disaggregation-transfer-backend
            - nixl
            - --disaggregation-bootstrap-port
            - "12345"
            - --host
            - "0.0.0.0"

    prefill:
      componentType: worker
      subComponentType: prefill
      replicas: 2
      envFromSecret: hf-token-secret
      resources:
        limits:
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: my-registry/sglang-runtime:my-tag
          workingDir: /workspace/examples/backends/sglang
          command: ["python3"]
          args:
            - -m
            - dynamo.sglang
            # Model to load and the name it is served under.
            - --model-path
            - Qwen/Qwen3-0.6B
            - --served-model-name
            - Qwen/Qwen3-0.6B
            - --page-size
            - "16"
            - --tp
            - "1"
            - --trust-remote-code
            - --skip-tokenizer-init
            # Disaggregation flags; only the mode value differs from the
            # decode worker above.
            - --disaggregation-mode
            - prefill
            - --disaggregation-transfer-backend
            - nixl
            - --disaggregation-bootstrap-port
            - "12345"
            - --host
            - "0.0.0.0"