# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# DynamoGraphDeployment custom resource named llama3-70b-disagg-mn.
# It declares the vLLM backend, one shared model-cache volume, and the services
# listed under spec.services (a frontend plus separate prefill and decode
# workers).
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: llama3-70b-disagg-mn
spec:
  backendFramework: vllm
  # Volume referenced by every service's volumeMounts entry below.
  # NOTE(review): `create: false` presumably means the PVC must already exist
  # in the namespace before this resource is applied - confirm against the
  # operator documentation.
  pvcs:
    - name: model-cache
      create: false
  services:
    # Frontend service: a single replica of the vllm-runtime image with the
    # shared model-cache volume mounted at /opt/models.
    Frontend:
      componentType: frontend
      volumeMounts:
        - name: model-cache
          mountPoint: /opt/models
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.0
          workingDir: /workspace/examples/backends/vllm
      envs:
        # Same path as the model-cache mount point above.
        - name: HF_HOME
          value: /opt/models
      replicas: 1
    # Prefill worker: one replica running `python3 -m dynamo.vllm` with
    # --disaggregation-mode prefill on 8 GPUs.
    VllmPrefillWorker:
      componentType: worker
      subComponentType: prefill
      # NOTE(review): presumably supplies the Hugging Face token used to
      # download the model - confirm the keys stored in this secret.
      envFromSecret: hf-token-secret
      volumeMounts:
        - name: model-cache
          mountPoint: /opt/models
      sharedMemory:
        size: 80Gi
      extraPodSpec:
        mainContainer:
          env:
            # Referenced as $SERVED_MODEL_NAME and $MODEL_PATH in args below.
            - name: SERVED_MODEL_NAME
              value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
            - name: MODEL_PATH
              value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
            # Same path as the model-cache mount point above.
            - name: HF_HOME
              value: /opt/models
          # The folded scalar (>-) joins the lines below with single spaces into
          # one string with no trailing newline. It is the single argument
          # handed to `/bin/sh -c` (see command), so the shell expands the
          # $VARS from env. Keep every line at the same indentation: a
          # more-indented line would introduce a literal newline.
          args:
            - >-
              python3 -m dynamo.vllm
              --model $MODEL_PATH
              --served-model-name $SERVED_MODEL_NAME
              --tensor-parallel-size 8
              --data-parallel-size 1
              --disaggregation-mode prefill
              --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
              --gpu-memory-utilization 0.90
              --no-enable-prefix-caching
              --block-size 128
          command:
            - /bin/sh
            - -c
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.0
          workingDir: /workspace/examples/backends/vllm
      replicas: 1
      # 8 GPUs = --tensor-parallel-size 8 x --data-parallel-size 1 in args.
      resources:
        limits:
          gpu: "8"
        requests:
          gpu: "8"
    # Decode worker: one replica running `python3 -m dynamo.vllm` on 8 GPUs.
    # NOTE(review): unlike VllmPrefillWorker, no --disaggregation-mode flag is
    # passed here - confirm that the default mode of this runtime version is the
    # intended decode behaviour.
    VllmDecodeWorker:
      componentType: worker
      subComponentType: decode
      # NOTE(review): presumably supplies the Hugging Face token used to
      # download the model - confirm the keys stored in this secret.
      envFromSecret: hf-token-secret
      volumeMounts:
        - name: model-cache
          mountPoint: /opt/models
      sharedMemory:
        size: 80Gi
      extraPodSpec:
        mainContainer:
          env:
            # Referenced as $SERVED_MODEL_NAME and $MODEL_PATH in args below.
            - name: SERVED_MODEL_NAME
              value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
            - name: MODEL_PATH
              value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
            # Same path as the model-cache mount point above.
            - name: HF_HOME
              value: /opt/models
          # The folded scalar (>-) joins the lines below with single spaces into
          # one string with no trailing newline. It is the single argument
          # handed to `/bin/sh -c` (see command), so the shell expands the
          # $VARS from env. Keep every line at the same indentation: a
          # more-indented line would introduce a literal newline.
          args:
            - >-
              python3 -m dynamo.vllm
              --model $MODEL_PATH
              --served-model-name $SERVED_MODEL_NAME
              --tensor-parallel-size 8
              --data-parallel-size 1
              --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
              --gpu-memory-utilization 0.90
              --no-enable-prefix-caching
              --block-size 128
          command:
            - /bin/sh
            - -c
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.0
          workingDir: /workspace/examples/backends/vllm
      replicas: 1
      # 8 GPUs = --tensor-parallel-size 8 x --data-parallel-size 1 in args.
      resources:
        limits:
          gpu: "8"
        requests:
          gpu: "8"