# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# DynamoGraphDeployment custom resource named "vllm-disagg-planner".
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: vllm-disagg-planner
  annotations:
    # Temporarily disable grove because the current k8s connector does not
    # work with grove. The value is quoted so it stays a string.
    nvidia.com/enable-grove: "false"
spec:
  # Environment variables declared at the spec level.
  # NOTE(review): presumably injected into every service below - confirm
  # against the operator's CRD documentation.
  envs:
    # JSON document with a "Prometheus" section: 5s global scrape interval and
    # two scrape jobs, "prometheus" (localhost:9090) and "frontend"
    # (vllm-disagg-planner-frontend:8000).
    - name: DYNAMO_SERVICE_CONFIG
      value: '{"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["vllm-disagg-planner-frontend:8000"]}]}]}}'
    # Quoted so the value is a string rather than an integer.
    - name: DYNAMO_PORT
      value: "8000"
    - name: DYNAMO_NAMESPACE
      value: "vllm-disagg-planner"
  services:
    # Frontend service: one replica running `dynamo.frontend` over HTTP on
    # port 8000 (the same port the "frontend" scrape job targets).
    Frontend:
      dynamoNamespace: vllm-disagg-planner
      componentType: frontend
      replicas: 1
      # Requests equal limits for both CPU and memory.
      resources:
        requests:
          cpu: "32"
          memory: "10Gi"
        limits:
          cpu: "32"
          memory: "10Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
          workingDir: /workspace/components/backends/vllm
          # `sh -c` receives the single string in `args` as its command line.
          command:
            - /bin/sh
            - -c
          args:
            - "python3 -m dynamo.frontend --http-port 8000"
    # Planner service: one replica running `planner_sla` against the
    # pre-generated profiling results mounted from the PVC below.
    Planner:
      dynamoNamespace: vllm-disagg-planner
      envFromSecret: hf-token-secret
      componentType: planner
      replicas: 1
      # Both probes execute `exit 0`, so they always report success; they do
      # not check the planner process itself.
      livenessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      readinessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      resources:
        requests:
          cpu: "2"
          memory: "2Gi"
        limits:
          cpu: "2"
          memory: "2Gi"
      pvc:
        create: false
        # Must be pre-created before deployment, and the SLA profiler must
        # already have been run.
        name: profiling-pvc
        # Same path that is passed to --profile-results-dir below.
        mountPoint: /workspace/profiling_results
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
          workingDir: /workspace/components/planner/src/dynamo/planner
          # Matches the --prometheus-port flag in the command below.
          ports:
            - name: metrics
              containerPort: 9085
          command:
            - /bin/sh
            - -c
          args:
            # Folded scalar (`>-`): the lines are joined with spaces into one
            # command string with no trailing newline.
            - >-
              python3 -m planner_sla
              --environment=kubernetes
              --backend=vllm
              --adjustment-interval=60
              --profile-results-dir=/workspace/profiling_results
              --prometheus-port=9085
    # NOTE: componentType is set to "frontend" on Prometheus to ensure a
    # service is created for the Prometheus component. This is a workaround
    # and should be managed differently.
    Prometheus:
      dynamoNamespace: vllm-disagg-planner
      componentType: frontend
      replicas: 1
      envs:
        - name: PYTHONPATH
          value: "/workspace/components/planner/src"
      # Both probes execute `exit 0`, so they always report success.
      livenessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      readinessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        initialDelaySeconds: 30
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      resources:
        requests:
          cpu: "2"
          memory: "2Gi"
        limits:
          cpu: "2"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
          workingDir: /workspace/components/backends/vllm
          # `sh -c` receives the single string in `args` as its command line.
          command:
            - /bin/sh
            - -c
          args:
            - "python3 -m dynamo.planner.prometheus"
    # Decode worker: two replicas of `dynamo.vllm` serving Qwen/Qwen3-0.6B,
    # one GPU each.
    VllmDecodeWorker:
      dynamoNamespace: vllm-disagg-planner
      envFromSecret: hf-token-secret
      componentType: worker
      replicas: 2
      resources:
        requests:
          cpu: "8"
          memory: "16Gi"
          gpu: "1"
        limits:
          cpu: "8"
          memory: "16Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          # Polls GET /health on port 9090 every 10s and tolerates 60
          # failures, i.e. roughly 10 minutes for the worker to start.
          startupProbe:
            httpGet:
              path: /health
              port: 9090
            periodSeconds: 10
            failureThreshold: 60
          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
          workingDir: /workspace/components/backends/vllm
          # `sh -c` receives the single string in `args` as its command line.
          command:
            - /bin/sh
            - -c
          args:
            - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --migration-limit=3"
    # Prefill worker: two replicas of `dynamo.vllm` serving Qwen/Qwen3-0.6B
    # with --is-prefill-worker, one GPU each.
    VllmPrefillWorker:
      dynamoNamespace: vllm-disagg-planner
      envFromSecret: hf-token-secret
      componentType: worker
      replicas: 2
      resources:
        requests:
          cpu: "8"
          memory: "16Gi"
          gpu: "1"
        limits:
          cpu: "8"
          memory: "16Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          # Polls GET /health on port 9090 every 10s and tolerates 60
          # failures, i.e. roughly 10 minutes for the worker to start.
          startupProbe:
            httpGet:
              path: /health
              port: 9090
            periodSeconds: 10
            failureThreshold: 60
          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
          workingDir: /workspace/components/backends/vllm
          # `sh -c` receives the single string in `args` as its command line.
          command:
            - /bin/sh
            - -c
          args:
            # Quoted like the other services' args; the parsed string is
            # unchanged.
            - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker --migration-limit=3"