# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---
# CustomResourceDefinition for DynamoGraphDeploymentRequest (short name: dgdr),
# a namespaced resource in the nvidia.com API group.
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
  annotations:
    # Records the controller-gen release associated with this manifest.
    controller-gen.kubebuilder.io/version: v0.16.4
    # Helm resource policy "keep": Helm leaves this resource in place instead of deleting it.
    helm.sh/resource-policy: keep
  name: dynamographdeploymentrequests.nvidia.com
spec:
  group: nvidia.com
  names:
    kind: DynamoGraphDeploymentRequest
    listKind: DynamoGraphDeploymentRequestList
    plural: dynamographdeploymentrequests
    shortNames:
      - dgdr
    singular: dynamographdeploymentrequest
  scope: Namespaced
  versions:
    # Extra columns for list output: Model, Backend, State, DGD-State and Age, in that order.
    - additionalPrinterColumns:
        - jsonPath: .spec.model
          name: Model
          type: string
        - jsonPath: .status.backend
          name: Backend
          type: string
        - jsonPath: .status.state
          name: State
          type: string
        - jsonPath: .status.deployment.state
          name: DGD-State
          type: string
        - jsonPath: .metadata.creationTimestamp
          name: Age
          type: date
      name: v1alpha1
      schema:
        openAPIV3Schema:
          # NOTE(review): the lifecycle list below does not mention the "Failed" state
          # that status.state lists among its possible values — confirm whether the
          # description should cover it.
          description: |-
            DynamoGraphDeploymentRequest is the Schema for the dynamographdeploymentrequests API.
            It serves as the primary interface for users to request model deployments with
            specific performance and resource constraints, enabling SLA-driven deployments.

            Lifecycle:
             1. Initial → Pending: Validates spec and prepares for profiling
             2. Pending → Profiling: Creates and runs profiling job (online or AIC)
             3. Profiling → Ready/Deploying: Generates DGD spec after profiling completes
             4. Deploying → Ready: When autoApply=true, monitors DGD until Ready
             5. Ready: Terminal state when DGD is operational or spec is available
             6. DeploymentDeleted: Terminal state when auto-created DGD is manually deleted

            The spec becomes immutable once profiling starts. Users must delete and recreate
            the DGDR to modify configuration after this point.
          properties:
            # apiVersion, kind and metadata form the standard Kubernetes object envelope.
            apiVersion:
              description: |-
                APIVersion defines the versioned schema of this representation of an object.
                Servers should convert recognized schemas to the latest internal value, and
                may reject unrecognized values.
                More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
              type: string
            kind:
              description: |-
                Kind is a string value representing the REST resource this object represents.
                Servers may infer this from the endpoint the client submits requests to.
                Cannot be updated.
                In CamelCase.
                More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
              type: string
            metadata:
              type: object
            spec:
              description: Spec defines the desired state for this deployment request.
              properties:
                # Opt-in switch; defaults to false, in which case only the generated spec is stored in status.
                autoApply:
                  default: false
                  description: |-
                    AutoApply indicates whether to automatically create a DynamoGraphDeployment
                    after profiling completes. If false, only the spec is generated and stored in status.
                    Users can then manually create a DGD using the generated spec.
                  type: boolean
                # Inference backend used for profiling; restricted to the enum values below.
                backend:
                  description: |-
                    Backend specifies the inference backend for profiling.
                    The controller automatically sets this value in profilingConfig.config.engine.backend.
                    Profiling runs on real GPUs or via AIC simulation to collect performance data.
                  enum:
                    - vllm
                    - sglang
                    - trtllm
                  type: string
                # Optional metadata/image overrides for the auto-created DGD; every field inside is optional.
                deploymentOverrides:
                  description: |-
                    DeploymentOverrides allows customizing metadata for the auto-created DGD.
                    Only applicable when AutoApply is true.
                  properties:
                    annotations:
                      additionalProperties:
                        type: string
                      description: Annotations are additional annotations to add to the DynamoGraphDeployment metadata.
                      type: object
                    labels:
                      additionalProperties:
                        type: string
                      description: |-
                        Labels are additional labels to add to the DynamoGraphDeployment metadata.
                        These are merged with auto-generated labels from the profiling process.
                      type: object
                    name:
                      description: |-
                        Name is the desired name for the created DynamoGraphDeployment.
                        If not specified, defaults to the DGDR name.
                      type: string
                    namespace:
                      description: |-
                        Namespace is the desired namespace for the created DynamoGraphDeployment.
                        If not specified, defaults to the DGDR namespace.
                      type: string
                    workersImage:
                      description: |-
                        WorkersImage specifies the container image to use for DynamoGraphDeployment worker components.
                        This image is used for both temporary DGDs created during online profiling and the final DGD.
                        If omitted, the image from the base config file (e.g., disagg.yaml) is used.
                        Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
                      type: string
                  type: object
                # DEPRECATED field: per its description, setting it has no effect.
                enableGpuDiscovery:
                  default: true
                  description: |-
                    EnableGPUDiscovery controls whether the operator attempts to discover GPU hardware from cluster nodes.
                    DEPRECATED: This field is deprecated and will be removed in v1beta1. GPU discovery is now always
                    attempted automatically. Setting this field has no effect - the operator will always try to discover
                    GPU hardware when node read permissions are available. If discovery is unavailable (e.g., namespace-scoped
                    operator without permissions), manual hardware configuration is required regardless of this setting.
                  type: boolean
                # High-level model identifier; also surfaced as the "Model" printer column (.spec.model).
                model:
                  description: |-
                    Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").
                    This is a high-level identifier for easy reference in kubectl output and logs.
                    The controller automatically sets this value in profilingConfig.config.deployment.model.
                  type: string
                # Configuration handed to the profiling job. The last note in the description refers to
                # the spec-level `model` and `backend` fields defined alongside this one.
                profilingConfig:
                  description: |-
                    ProfilingConfig provides the complete configuration for the profiling job.
                    Note: GPU discovery is automatically attempted to detect GPU resources from Kubernetes
                    cluster nodes. If the operator has node read permissions (cluster-wide or explicitly granted),
                    discovered GPU configuration is used as defaults when hardware configuration is not manually
                    specified (minNumGpusPerEngine, maxNumGpusPerEngine, numGpusPerNode). User-specified values
                    always take precedence over auto-discovered values. If GPU discovery fails (e.g.,
                    namespace-restricted operator without node permissions), manual hardware config is required.
                    This configuration is passed directly to the profiler.
                    The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).
                    Note: deployment.model and engine.backend are automatically set from the high-level
                    model and backend fields and should not be specified in this config.
                  properties:
                    # Free-form object: unknown fields are preserved rather than pruned by the API server.
                    config:
                      description: |-
                        Config is the profiling configuration as arbitrary JSON/YAML. This will be passed directly to the profiler.
                        The profiler will validate the configuration and report any errors.
                      type: object
                      x-kubernetes-preserve-unknown-fields: true
                    # Optional ConfigMap reference; when present, `name` is required and `key` defaults to disagg.yaml.
                    configMapRef:
                      description: |-
                        ConfigMapRef is an optional reference to a ConfigMap containing the DynamoGraphDeployment
                        base config file (disagg.yaml). This is separate from the profiling config above.
                        The path to this config will be set as engine.config in the profiling config.
                      properties:
                        key:
                          default: disagg.yaml
                          description: Key in the ConfigMap to select. If not specified, defaults to "disagg.yaml".
                          type: string
                        name:
                          description: Name of the ConfigMap containing the desired data.
                          type: string
                      required:
                        - name
                      type: object
                    # String-to-string label map used to place the profiling pod.
                    nodeSelector:
                      additionalProperties:
                        type: string
                      description: |-
                        NodeSelector is a selector which must match a node's labels for the profiling pod to be scheduled on that node.
                        For example, to schedule on ARM64 nodes, use {"kubernetes.io/arch": "arm64"}.
                      type: object
                    # Optional PVC name (plain string) for persisting profiling artifacts.
                    outputPVC:
                      description: |-
                        OutputPVC is an optional PersistentVolumeClaim name for storing profiling output.
                        If specified, all profiling artifacts (logs, plots, configs, raw data) will be written
                        to this PVC instead of an ephemeral emptyDir volume. This allows users to access
                        complete profiling results after the job completes by mounting the PVC.
                        The PVC must exist in the same namespace as the DGDR.
                        If not specified, profiling uses emptyDir and only essential data is saved to ConfigMaps.
                        Note: ConfigMaps are still created regardless of this setting for planner integration.
                      type: string
                    # The only required field of profilingConfig (see its `required` list).
                    profilerImage:
                      description: |-
                        ProfilerImage specifies the container image to use for profiling jobs.
                        This image contains the profiler code and dependencies needed for SLA-based profiling.
                        Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
                      type: string
                    # Container resource requirements for the profiling job: claims, limits and requests.
                    resources:
                      description: |-
                        Resources specifies the compute resource requirements for the profiling job container.
                        If not specified, no resource requests or limits are set.
                      properties:
                        claims:
                          description: |-
                            Claims lists the names of resources, defined in spec.resourceClaims,
                            that are used by this container.

                            This field depends on the
                            DynamicResourceAllocation feature gate.

                            This field is immutable. It can only be set for containers.
                          items:
                            description: ResourceClaim references one entry in PodSpec.ResourceClaims.
                            properties:
                              name:
                                description: |-
                                  Name must match the name of one entry in pod.spec.resourceClaims of
                                  the Pod where this field is used. It makes that resource available
                                  inside a container.
                                type: string
                              request:
                                description: |-
                                  Request is the name chosen for a request in the referenced claim.
                                  If empty, everything from the claim is made available, otherwise
                                  only the result of this request.
                                type: string
                            required:
                              - name
                            type: object
                          type: array
                          # Entries are keyed by `name` (map-type list).
                          x-kubernetes-list-map-keys:
                            - name
                          x-kubernetes-list-type: map
                        limits:
                          additionalProperties:
                            # Quantity values: accepted as either an integer or a string matching the pattern.
                            anyOf:
                              - type: integer
                              - type: string
                            pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
                            x-kubernetes-int-or-string: true
                          description: |-
                            Limits describes the maximum amount of compute resources allowed.
                            More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
                          type: object
                        requests:
                          additionalProperties:
                            # Same integer-or-string quantity schema as `limits`.
                            anyOf:
                              - type: integer
                              - type: string
                            pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
                            x-kubernetes-int-or-string: true
                          description: |-
                            Requests describes the minimum amount of compute resources required.
                            If Requests is omitted for a container, it defaults to Limits if that is explicitly specified,
                            otherwise to an implementation-defined value. Requests cannot exceed Limits.
                            More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
                          type: object
                      type: object
                    # List of tolerations for the profiling job; no item field is marked required.
                    tolerations:
                      description: |-
                        Tolerations allows the profiling job to be scheduled on nodes with matching taints.
                        For example, to schedule on GPU nodes, add a toleration for the nvidia.com/gpu taint.
                      items:
                        description: |-
                          The pod this Toleration is attached to tolerates any taint that matches
                          the triple <key,value,effect> using the matching operator <operator>.
                        properties:
                          effect:
                            description: |-
                              Effect indicates the taint effect to match. Empty means match all taint effects.
                              When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.
                            type: string
                          key:
                            description: |-
                              Key is the taint key that the toleration applies to. Empty means match all taint keys.
                              If the key is empty, operator must be Exists; this combination means to match all values and all keys.
                            type: string
                          operator:
                            description: |-
                              Operator represents a key's relationship to the value.
                              Valid operators are Exists and Equal. Defaults to Equal.
                              Exists is equivalent to wildcard for value, so that a pod can
                              tolerate all taints of a particular category.
                            type: string
                          tolerationSeconds:
                            description: |-
                              TolerationSeconds represents the period of time the toleration (which must be
                              of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default,
                              it is not set, which means tolerate the taint forever (do not evict). Zero and
                              negative values will be treated as 0 (evict immediately) by the system.
                            format: int64
                            type: integer
                          value:
                            description: |-
                              Value is the taint value the toleration matches to.
                              If the operator is Exists, the value should be empty, otherwise just a regular string.
                            type: string
                        type: object
                      type: array
                  # profilerImage is the only mandatory key inside profilingConfig.
                  required:
                    - profilerImage
                  type: object
                # Opt-in switch for a mocker deployment; defaults to false.
                useMocker:
                  default: false
                  description: |-
                    UseMocker indicates whether to deploy a mocker DynamoGraphDeployment instead of
                    a real backend deployment. When true, the deployment uses simulated engines that
                    don't require GPUs, using the profiling data to simulate realistic timing behavior.
                    Mocker is available in all backend images and useful for large-scale experiments.
                    Profiling still runs against the real backend (specified above) to collect performance data.
                  type: boolean
              # Mandatory spec fields; every other spec property is optional.
              required:
                - backend
                - model
                - profilingConfig
              type: object
            status:
              description: Status reflects the current observed state of this deployment request.
              properties:
                # Display-only value; also surfaced as the "Backend" printer column (.status.backend).
                backend:
                  description: |-
                    Backend is extracted from profilingConfig.config.engine.backend for display purposes.
                    This field is populated by the controller and shown in kubectl output.
                  type: string
                # NOTE(review): the description says conditions are merged by type, but this list
                # declares no x-kubernetes-list-type / x-kubernetes-list-map-keys — confirm whether
                # the list-type markers are intended on the source type.
                conditions:
                  description: |-
                    Conditions contains the latest observed conditions of the deployment request.
                    Standard condition types include: Validation, Profiling, SpecGenerated, DeploymentReady.
                    Conditions are merged by type on patch updates.
                  items:
                    description: Condition contains details for one aspect of the current state of this API Resource.
                    properties:
                      lastTransitionTime:
                        description: |-
                          lastTransitionTime is the last time the condition transitioned from one status to another.
                          This should be when the underlying condition changed.  If that is not known, then using the time when the API field changed is acceptable.
                        format: date-time
                        type: string
                      message:
                        description: |-
                          message is a human readable message indicating details about the transition.
                          This may be an empty string.
                        maxLength: 32768
                        type: string
                      observedGeneration:
                        description: |-
                          observedGeneration represents the .metadata.generation that the condition was set based upon.
                          For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
                          with respect to the current state of the instance.
                        format: int64
                        minimum: 0
                        type: integer
                      reason:
                        description: |-
                          reason contains a programmatic identifier indicating the reason for the condition's last transition.
                          Producers of specific condition types may define expected values and meanings for this field,
                          and whether the values are considered a guaranteed API.
                          The value should be a CamelCase string.
                          This field may not be empty.
                        maxLength: 1024
                        minLength: 1
                        pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
                        type: string
                      status:
                        description: status of the condition, one of True, False, Unknown.
                        # "True" and "False" are quoted so they stay strings rather than YAML booleans.
                        enum:
                          - "True"
                          - "False"
                          - Unknown
                        type: string
                      type:
                        description: type of condition in CamelCase or in foo.example.com/CamelCase.
                        maxLength: 316
                        pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
                        type: string
                    # observedGeneration is the only optional field of a condition.
                    required:
                      - lastTransitionTime
                      - message
                      - reason
                      - status
                      - type
                    type: object
                  type: array
                # Tracking record for the auto-created DGD; its `state` feeds the "DGD-State"
                # printer column (.status.deployment.state).
                deployment:
                  description: |-
                    Deployment tracks the auto-created DGD when AutoApply is true.
                    Contains name, namespace, state, and creation status of the managed DGD.
                  properties:
                    created:
                      description: |-
                        Created indicates whether the DGD has been successfully created.
                        Used to prevent recreation if the DGD is manually deleted by users.
                      type: boolean
                    name:
                      description: Name is the name of the created DynamoGraphDeployment.
                      type: string
                    namespace:
                      description: Namespace is the namespace of the created DynamoGraphDeployment.
                      type: string
                    state:
                      description: |-
                        State is the current state of the DynamoGraphDeployment.
                        This value is mirrored from the DGD's status.state field.
                      type: string
                  type: object
                # Embedded resource stored verbatim: unknown fields are preserved rather than pruned.
                generatedDeployment:
                  description: |-
                    GeneratedDeployment contains the full generated DynamoGraphDeployment specification
                    including metadata, based on profiling results. Users can extract this to create
                    a DGD manually, or it's used automatically when autoApply is true.
                    Stored as RawExtension to preserve all fields including metadata.
                    For mocker backends, this contains the mocker DGD spec.
                  type: object
                  x-kubernetes-embedded-resource: true
                  x-kubernetes-preserve-unknown-fields: true
                # Stored as a 64-bit integer (format: int64).
                observedGeneration:
                  description: |-
                    ObservedGeneration reflects the generation of the most recently observed spec.
                    Used to detect spec changes and enforce immutability after profiling starts.
                  format: int64
                  type: integer
                # Plain string reference to the results ConfigMap; the format is described below.
                profilingResults:
                  description: |-
                    ProfilingResults contains a reference to the ConfigMap holding profiling data.
                    Format: "configmap/\<name\>"
                  type: string
                # NOTE(review): the possible values are listed only in the description; no enum
                # constraint is declared on this field — confirm that is intentional.
                state:
                  description: |-
                    State is a high-level textual status of the deployment request lifecycle.
                    Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed"
                    Empty string ("") represents the initial state before initialization.
                  type: string
              type: object
          type: object
      # v1alpha1 is both served and the storage version; the status subresource is enabled.
      served: true
      storage: true
      subresources:
        status: {}