# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# DynamoGraphDeployment "disagg-planner": declares five services (Frontend,
# VllmWorker, PrefillWorker, Planner, Prometheus), all in the
# `vllm-v0-disagg-planner` dynamoNamespace and all launched with
# `dynamo serve graphs.disagg_planner:<Service>` from the same image.
---
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: disagg-planner
spec:
  envs:
    # JSON document with one top-level object per service plus a "Common"
    # object; each service's "common-configs" list names keys of "Common".
    # The value is a single-quoted scalar on purpose: backslashes are literal
    # in single quotes, so the nested `\"` escapes inside "kv-transfer-config"
    # reach the consumer unchanged. Do not re-wrap or fold this line -- any
    # inserted whitespace would change the string.
    - name: DYN_DEPLOYMENT_CONFIG
      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["localhost:8000"]}]}]},"Planner":{"adjustment-interval":180,"profile-results-dir":"/workspace/examples/profiling_results","isl":3000,"osl":150,"ttft":0.5,"itl":0.05,"load-predictor":"arima"}}'

  services:
    # Frontend (componentType: main). The embedded config above gives it
    # port 8000 and the "round-robin" router.
    Frontend:
      dynamoNamespace: vllm-v0-disagg-planner
      componentType: main
      replicas: 1
      resources:
        # Requests and limits are set to the same values for every service
        # in this file.
        requests:
          cpu: "2"
          memory: "4Gi"
        limits:
          cpu: "2"
          memory: "4Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v0
          args:
            - dynamo
            - serve
            - graphs.disagg_planner:Frontend
            - --system-app-port
            # Quoted so the argument stays a string rather than an integer.
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend

    # VllmWorker: requests 2 GPUs and references the `hf-token-secret`
    # secret via envFromSecret.
    VllmWorker:
      dynamoNamespace: vllm-v0-disagg-planner
      envFromSecret: hf-token-secret
      replicas: 1
      resources:
        requests:
          cpu: "20"
          memory: "40Gi"
          nvidia.com/gpu: "2"
        limits:
          cpu: "20"
          memory: "40Gi"
          nvidia.com/gpu: "2"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v0
          args:
            - dynamo
            - serve
            - graphs.disagg_planner:VllmWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmWorker

    # PrefillWorker: same resource shape and secret reference as VllmWorker.
    PrefillWorker:
      dynamoNamespace: vllm-v0-disagg-planner
      envFromSecret: hf-token-secret
      replicas: 1
      resources:
        requests:
          cpu: "20"
          memory: "40Gi"
          nvidia.com/gpu: "2"
        limits:
          cpu: "20"
          memory: "40Gi"
          nvidia.com/gpu: "2"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v0
          args:
            - dynamo
            - serve
            - graphs.disagg_planner:PrefillWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - PrefillWorker

    # Planner (componentType: planner). It is the only service that receives
    # an extra CLI override, `--Planner.environment=kubernetes`.
    Planner:
      dynamoNamespace: vllm-v0-disagg-planner
      replicas: 1
      componentType: planner
      resources:
        requests:
          cpu: "2"
          memory: "2Gi"
        limits:
          cpu: "2"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v0
          args:
            - dynamo
            - serve
            - graphs.disagg_planner:Planner
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Planner
            - --Planner.environment=kubernetes

    # Prometheus: its scrape configuration (5s interval; targets
    # localhost:9090 and localhost:8000) lives in DYN_DEPLOYMENT_CONFIG above.
    Prometheus:
      dynamoNamespace: vllm-v0-disagg-planner
      replicas: 1
      resources:
        requests:
          cpu: "1000m"
          memory: "1000Mi"
        limits:
          cpu: "1000m"
          memory: "1000Mi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v0
          args:
            - dynamo
            - serve
            - graphs.disagg_planner:Prometheus
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Prometheus