# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: vllm-disagg-planner
  annotations:
    # Grove is temporarily disabled: the current k8s connector does not work
    # with grove.
    nvidia.com/enable-grove: "false"
spec:
  # Environment variables declared at the spec level, outside any single
  # service definition.
  envs:
    # JSON payload with a Prometheus configuration: a 5s global scrape
    # interval plus two static scrape jobs, "prometheus" (localhost:9090) and
    # "frontend" (vllm-disagg-planner-frontend:8000, the same port the
    # Frontend service passes as --http-port).
    - name: DYNAMO_SERVICE_CONFIG
      value: '{"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["vllm-disagg-planner-frontend:8000"]}]}]}}'
    # Same value as the dynamoNamespace set on every service in this file.
    - name: DYNAMO_NAMESPACE
      value: "vllm-disagg-planner"
  services:
    # HTTP frontend: one replica that launches dynamo.frontend on port 8000.
    Frontend:
      componentType: frontend
      dynamoNamespace: vllm-disagg-planner
      replicas: 1
      resources:
        # Requests and limits are deliberately written with identical values.
        requests:
          cpu: "32"
          memory: "10Gi"
        limits:
          cpu: "32"
          memory: "10Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
          workingDir: /workspace/components/backends/vllm
          # The single args entry is the script handed to `/bin/sh -c`.
          command: ["/bin/sh", "-c"]
          args:
            - "python3 -m dynamo.frontend --http-port 8000"
    # SLA planner: one replica running planner_sla with the profiling results
    # directory mounted from an existing PVC.
    Planner:
      componentType: planner
      dynamoNamespace: vllm-disagg-planner
      envFromSecret: hf-token-secret
      replicas: 1
      # Both probes only run `exit 0`, so they always report success.
      livenessProbe:
        exec:
          command: ["/bin/sh", "-c", "exit 0"]
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      readinessProbe:
        exec:
          command: ["/bin/sh", "-c", "exit 0"]
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      resources:
        requests:
          cpu: "2"
          memory: "2Gi"
        limits:
          cpu: "2"
          memory: "2Gi"
      pvc:
        # The claim is not created here (create: false). It must be
        # pre-created before deployment and the SLA profiler must have been
        # run.
        create: false
        name: profiling-pvc
        # Same path that is passed to --profile-results-dir below.
        mountPoint: /workspace/profiling_results
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
          workingDir: /workspace/components/planner/src/dynamo/planner
          ports:
            # Same port number that is passed to --prometheus-port below.
            - name: metrics
              containerPort: 9085
          # The single args entry is the script handed to `/bin/sh -c`.
          command: ["/bin/sh", "-c"]
          args:
            # Folded scalar: the lines below are joined with single spaces
            # into one command line.
            - >-
              python3 -m planner_sla
              --environment=kubernetes
              --backend=vllm
              --adjustment-interval=60
              --profile-results-dir=/workspace/profiling_results
              --prometheus-port=9085
    # NOTE: this is set on Prometheus to ensure a service is created for the
    # Prometheus component. This is a workaround and should be managed
    # differently.
    # NOTE(review): "this" presumably refers to `componentType: frontend`
    # below - confirm.
    Prometheus:
      componentType: frontend
      dynamoNamespace: vllm-disagg-planner
      replicas: 1
      envs:
        - name: PYTHONPATH
          value: "/workspace/components/planner/src"
      # Both probes only run `exit 0`, so they always report success.
      livenessProbe:
        exec:
          command: ["/bin/sh", "-c", "exit 0"]
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      readinessProbe:
        exec:
          command: ["/bin/sh", "-c", "exit 0"]
        initialDelaySeconds: 30
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      resources:
        requests:
          cpu: "2"
          memory: "2Gi"
        limits:
          cpu: "2"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
          workingDir: /workspace/components/backends/vllm
          # The single args entry is the script handed to `/bin/sh -c`.
          command: ["/bin/sh", "-c"]
          args:
            - "python3 -m dynamo.planner.prometheus"
    # Decode workers: two replicas, one GPU each, running dynamo.vllm without
    # the --is-prefill-worker flag.
    VllmDecodeWorker:
      componentType: worker
      dynamoNamespace: vllm-disagg-planner
      envFromSecret: hf-token-secret
      replicas: 2
      resources:
        requests:
          cpu: "8"
          memory: "16Gi"
          gpu: "1"
        limits:
          cpu: "8"
          memory: "16Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
          workingDir: /workspace/components/backends/vllm
          # The single args entry is the script handed to `/bin/sh -c`.
          command: ["/bin/sh", "-c"]
          args:
            - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --migration-limit=3"
          # Polls GET /health on port 9090 every 10s and tolerates up to 60
          # failures, i.e. roughly 10 minutes of startup time.
          startupProbe:
            httpGet:
              path: /health
              port: 9090
            periodSeconds: 10
            failureThreshold: 60
    # Prefill workers: two replicas, one GPU each, running dynamo.vllm with
    # the --is-prefill-worker flag.
    VllmPrefillWorker:
      componentType: worker
      dynamoNamespace: vllm-disagg-planner
      envFromSecret: hf-token-secret
      replicas: 2
      resources:
        requests:
          cpu: "8"
          memory: "16Gi"
          gpu: "1"
        limits:
          cpu: "8"
          memory: "16Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
          workingDir: /workspace/components/backends/vllm
          # The single args entry is the script handed to `/bin/sh -c`.
          command: ["/bin/sh", "-c"]
          args:
            # Quoted like the other single-line launch commands in this file.
            # The parsed string is unchanged.
            - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker --migration-limit=3"
          # Polls GET /health on port 9090 every 10s and tolerates up to 60
          # failures, i.e. roughly 10 minutes of startup time.
          startupProbe:
            httpGet:
              path: /health
              port: 9090
            periodSeconds: 10
            failureThreshold: 60