# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# DynamoGraphDeployment with three services sharing the
# `llama3-70b-disagg-mn` Dynamo namespace: one frontend, one vLLM prefill
# worker and one vLLM decode worker. Both workers serve
# RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic and each requests 8 GPUs.
#
# NOTE(review): every image below uses the tag `my-tag`, which looks like a
# placeholder - confirm the real tag before applying.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: llama3-70b-disagg-mn
spec:
  backendFramework: vllm
  pvcs:
    # `create: false` - presumably the `model-cache` PVC must already exist
    # in the target namespace; verify against the operator documentation.
    - name: model-cache
      create: false
  services:
    # Frontend service; mounts the shared model cache at the Hugging Face
    # cache path.
    Frontend:
      componentType: frontend
      dynamoNamespace: llama3-70b-disagg-mn
      volumeMounts:
        - name: model-cache
          mountPoint: /root/.cache/huggingface
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
          workingDir: /workspace/components/backends/vllm
      replicas: 1

    # Prefill worker: launched with `--is-prefill-worker`,
    # tensor-parallel-size 8 and gpu-memory-utilization 0.95.
    VllmPrefillWorker:
      componentType: worker
      dynamoNamespace: llama3-70b-disagg-mn
      envFromSecret: hf-token-secret
      volumeMounts:
        - name: model-cache
          mountPoint: /root/.cache/huggingface
      sharedMemory:
        size: 80Gi
      extraPodSpec:
        mainContainer:
          # The whole command line is a single string handed to `/bin/sh -c`.
          # The folded scalar (`>-`) joins the lines below with single spaces
          # and strips the trailing newline, yielding one argument.
          args:
            - >-
              python3 -m dynamo.vllm
              --model RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
              --tensor-parallel-size 8
              --data-parallel-size 1
              --disable-log-requests
              --is-prefill-worker
              --gpu-memory-utilization 0.95
              --no-enable-prefix-caching
              --block-size 128
          command:
            - /bin/sh
            - -c
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
          workingDir: /workspace/components/backends/vllm
      replicas: 1
      resources:
        limits:
          gpu: "8"
        requests:
          gpu: "8"

    # Decode worker: same model and parallelism flags as the prefill worker,
    # but without `--is-prefill-worker` and with gpu-memory-utilization 0.90.
    VllmDecodeWorker:
      componentType: worker
      dynamoNamespace: llama3-70b-disagg-mn
      envFromSecret: hf-token-secret
      volumeMounts:
        - name: model-cache
          mountPoint: /root/.cache/huggingface
      sharedMemory:
        size: 80Gi
      extraPodSpec:
        mainContainer:
          # Single string handed to `/bin/sh -c`; folded to one line by `>-`.
          args:
            - >-
              python3 -m dynamo.vllm
              --model RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
              --tensor-parallel-size 8
              --data-parallel-size 1
              --disable-log-requests
              --gpu-memory-utilization 0.90
              --no-enable-prefix-caching
              --block-size 128
          command:
            - /bin/sh
            - -c
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
          workingDir: /workspace/components/backends/vllm
      replicas: 1
      resources:
        limits:
          gpu: "8"
        requests:
          gpu: "8"