# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# DynamoGraphDeployment describing a disaggregated vLLM setup: a frontend,
# an SLA planner, a Prometheus instance, and separate decode/prefill workers.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: vllm-disagg-planner
spec:
  # Spec-level environment variables.
  # NOTE(review): presumably injected into every service below - confirm
  # against the operator's CRD documentation.
  envs:
    # JSON-encoded Prometheus configuration: 5s scrape interval, scraping
    # Prometheus itself (localhost:9090) and the frontend on port 8000.
    - name: DYNAMO_SERVICE_CONFIG
      value: '{"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["vllm-disagg-planner-frontend:8000"]}]}]}}'
    # Same port the Frontend service listens on (--http-port 8000).
    - name: DYNAMO_PORT
      value: "8000"
    # Matches the dynamoNamespace used by every service in this graph.
    - name: DYNAMO_NAMESPACE
      value: "vllm-disagg-planner"
  services:
    # HTTP frontend; started with `python3 -m dynamo.frontend` on port 8000.
    Frontend:
      dynamoNamespace: vllm-disagg-planner
      componentType: main
      replicas: 1
      # Liveness: plain HTTP GET against /health on the frontend port.
      livenessProbe:
        httpGet:
          path: /health
          port: 8000
        initialDelaySeconds: 20
        periodSeconds: 5
        timeoutSeconds: 5
        failureThreshold: 3
      # Readiness: stricter than liveness - the /health JSON body must report
      # status == "healthy" (jq -e exits non-zero otherwise).
      # Requires curl and jq to be present in the container image.
      readinessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      # Requests equal limits for both CPU and memory.
      resources:
        requests:
          cpu: "32"
          memory: "10Gi"
        limits:
          cpu: "32"
          memory: "10Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
          workingDir: /workspace/components/backends/vllm
          # The single args entry is the script handed to `/bin/sh -c`.
          command:
            - /bin/sh
            - -c
          args:
            - "python3 -m dynamo.frontend --http-port 8000"
    # SLA planner; runs the `planner_sla` module against pre-computed
    # profiling results mounted from a PVC.
    Planner:
      dynamoNamespace: vllm-disagg-planner
      envFromSecret: hf-token-secret
      componentType: planner
      replicas: 1
      # `exit 0` always succeeds, so this probe never reports a failure.
      livenessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      # Same always-passing check; the pod becomes ready after the initial delay.
      readinessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      # Requests equal limits for both CPU and memory.
      resources:
        requests:
          cpu: "2"
          memory: "2Gi"
        limits:
          cpu: "2"
          memory: "2Gi"
      # Existing volume (create: false); mountPoint matches the
      # --profile-results-dir argument passed to the planner below.
      pvc:
        create: false
        name: profiling-pvc # Must be pre-created before deployment and SLA profiler must have been run
        mountPoint: /workspace/profiling_results
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
          workingDir: /workspace/components/planner/src/dynamo/planner
          # No `command` override here: each args entry is a separate argv
          # element rather than a single shell script string.
          args:
            - python
            - -m
            - planner_sla
            - --environment=kubernetes
            - --backend=vllm
            - --adjustment-interval=60
            - --profile-results-dir=/workspace/profiling_results
    # Prometheus instance launched via `python3 -m dynamo.planner.prometheus`.
    Prometheus:
      dynamoNamespace: vllm-disagg-planner
      componentType: main
      replicas: 1
      envs:
        # Puts the planner sources on the module search path.
        # NOTE(review): presumably needed so `dynamo.planner.prometheus`
        # resolves from the vllm working directory - confirm.
        - name: PYTHONPATH
          value: "/workspace/components/planner/src"
      # `exit 0` always succeeds, so this probe never reports a failure.
      livenessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      # Same always-passing check; the pod becomes ready after the initial delay.
      readinessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        initialDelaySeconds: 30
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      # Requests equal limits for both CPU and memory.
      resources:
        requests:
          cpu: "2"
          memory: "2Gi"
        limits:
          cpu: "2"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
          workingDir: /workspace/components/backends/vllm
          # The single args entry is the script handed to `/bin/sh -c`.
          command:
            - /bin/sh
            - -c
          args:
            - "python3 -m dynamo.planner.prometheus"
    # Decode workers: two replicas of `dynamo.vllm` serving Qwen/Qwen3-0.6B,
    # one GPU each.
    VllmDecodeWorker:
      dynamoNamespace: vllm-disagg-planner
      envFromSecret: hf-token-secret
      componentType: worker
      replicas: 2
      # Liveness hits /live on port 9090 (the DYN_SYSTEM_PORT set below).
      # failureThreshold: 1 means a single failed check fails the probe.
      livenessProbe:
        httpGet:
          path: /live
          port: 9090
        periodSeconds: 5
        timeoutSeconds: 30
        failureThreshold: 1
      # Readiness hits /health on the same port.
      readinessProbe:
        httpGet:
          path: /health
          port: 9090
        periodSeconds: 10
        timeoutSeconds: 30
        failureThreshold: 60
      # Requests equal limits; one GPU per replica.
      resources:
        requests:
          cpu: "8"
          memory: "16Gi"
          gpu: "1"
        limits:
          cpu: "8"
          memory: "16Gi"
          gpu: "1"
      envs:
        # NOTE(review): presumably turns on the HTTP status server that the
        # probes above query - confirm against the runtime documentation.
        - name: DYN_SYSTEM_ENABLED
          value: "true"
        # JSON list of endpoint names, passed as a string.
        # NOTE(review): presumably ties /health to the "generate" endpoint's
        # status - confirm.
        - name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
          value: "[\"generate\"]"
        # Must stay in sync with the port used by all three probes.
        - name: DYN_SYSTEM_PORT
          value: "9090"
      extraPodSpec:
        mainContainer:
          # Allows up to 60 checks at 10s intervals for the worker to come up.
          startupProbe:
            httpGet:
              path: /health
              port: 9090
            periodSeconds: 10
            failureThreshold: 60
          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
          workingDir: /workspace/components/backends/vllm
          # The single args entry is the script handed to `/bin/sh -c`;
          # combined stdout/stderr is copied to /tmp/vllm.log via tee.
          command:
            - /bin/sh
            - -c
          args:
            - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log"
    # Prefill workers: two replicas of `dynamo.vllm --is-prefill-worker`
    # serving Qwen/Qwen3-0.6B, one GPU each.
    VllmPrefillWorker:
      dynamoNamespace: vllm-disagg-planner
      envFromSecret: hf-token-secret
      componentType: worker
      replicas: 2
      # Liveness uses /live, matching VllmDecodeWorker. With
      # failureThreshold: 1, probing /health here would restart the pod on a
      # single failed health check; /health is kept for readiness and startup.
      livenessProbe:
        httpGet:
          path: /live
          port: 9090
        periodSeconds: 5
        timeoutSeconds: 30
        failureThreshold: 1
      # Readiness hits /health on port 9090 (the DYN_SYSTEM_PORT set below).
      readinessProbe:
        httpGet:
          path: /health
          port: 9090
        periodSeconds: 10
        timeoutSeconds: 30
        failureThreshold: 60
      # Requests equal limits; one GPU per replica.
      resources:
        requests:
          cpu: "8"
          memory: "16Gi"
          gpu: "1"
        limits:
          cpu: "8"
          memory: "16Gi"
          gpu: "1"
      envs:
        # NOTE(review): presumably turns on the HTTP status server that the
        # probes above query - confirm against the runtime documentation.
        - name: DYN_SYSTEM_ENABLED
          value: "true"
        # JSON list of endpoint names, passed as a string.
        # NOTE(review): presumably ties /health to the "generate" endpoint's
        # status - confirm.
        - name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
          value: "[\"generate\"]"
        # Must stay in sync with the port used by all three probes.
        - name: DYN_SYSTEM_PORT
          value: "9090"
      extraPodSpec:
        mainContainer:
          # Allows up to 60 checks at 10s intervals for the worker to come up.
          startupProbe:
            httpGet:
              path: /health
              port: 9090
            periodSeconds: 10
            failureThreshold: 60
          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
          workingDir: /workspace/components/backends/vllm
          # The single args entry is the script handed to `/bin/sh -c`;
          # combined stdout/stderr is copied to /tmp/vllm.log via tee.
          # Quoted because the value contains shell metacharacters (| and >&).
          command:
            - /bin/sh
            - -c
          args:
            - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker 2>&1 | tee /tmp/vllm.log"