# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# DynamoGraphDeployment "agg-8xtp2": one frontend plus a vLLM worker service
# running 8 replicas, each with tensor-parallel size 2 (2 GPUs per replica,
# 16 GPUs in total), serving Qwen/Qwen3-32B.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: agg-8xtp2
spec:
  # Both PVCs are declared with create: false.
  # NOTE(review): presumably they must already exist in the namespace — confirm.
  pvcs:
    - create: false
      name: model-cache
    - create: false
      name: compilation-cache
  services:
    Frontend:
      componentType: frontend
      # NOTE(review): HF_HOME is set here, but this service declares no
      # volumeMounts — confirm the frontend does not need the model-cache PVC.
      envs:
        - name: HF_HOME
          value: /home/dynamo/.cache/huggingface
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.0.0
          workingDir: /workspace
          # Runs `python3 -m dynamo.frontend --router-reset-states`.
          command:
            - python3
            - -m
            - dynamo.frontend
          args:
            - --router-reset-states
      replicas: 1
      # CPU request equals CPU limit (8 each); quoted so the value stays a string.
      resources:
        requests:
          cpu: "8"
        limits:
          cpu: "8"
      subComponentType: null
    VllmDecodeWorker:
      componentType: worker
      envFromSecret: hf-token-secret
      volumeMounts:
        # Same path as HF_HOME in the container env below.
        - name: model-cache
          mountPoint: /home/dynamo/.cache/huggingface
        - name: compilation-cache
          mountPoint: /home/dynamo/.cache/vllm
          useAsCompilationCache: true
      extraPodSpec:
        mainContainer:
          # Arguments passed to `python3 -m dynamo.vllm`. Flags and their values
          # are separate list items; numeric values are quoted so they remain
          # strings.
          args:
            - --model
            - Qwen/Qwen3-32B
            # Matches the 2 GPUs requested per replica under `resources`.
            - --tensor-parallel-size
            - '2'
            - --disable-log-requests
            - --gpu-memory-utilization
            - '0.90'
            - --async-scheduling
            - --block-size
            - '64'
            # YaRN rope scaling: factor 4.0 x original 32768 positions = 131072,
            # the same value given to --max-model-len. The JSON is single-quoted
            # so its inner double quotes need no escaping.
            - --hf-overrides
            - '{"rope_scaling":{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768},"max_position_embeddings":131072}'
            - --max-model-len
            - '131072'
          command:
            - python3
            - -m
            - dynamo.vllm
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.0.0
          env:
            # Quoted: the value is the string "false", not a YAML boolean.
            - name: DYN_HEALTH_CHECK_ENABLED
              value: "false"
            - name: HF_HOME
              value: /home/dynamo/.cache/huggingface
          workingDir: /workspace
      replicas: 8
      resources:
        limits:
          gpu: '2'
          # Custom resource `rdma/ib`, 2 per replica; listed under limits only.
          custom:
            rdma/ib: "2"
        requests:
          gpu: '2'
      subComponentType: decode