# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Example DynamoGraphDeploymentRequest: profiles a model against the SLA
# targets below and, because autoApply is true, creates the resulting
# DynamoGraphDeployment once profiling finishes.
---
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeploymentRequest
metadata:
  name: example-llm-sla
spec:
  # Model is a high-level identifier for the model being deployed (required - injected into profilingConfig.config.deployment.model)
  model: Qwen/Qwen3-0.6B

  # Backend to use for profiling (required - injected into profilingConfig.config.engine.backend)
  backend: trtllm

  # ProfilerImage is the container image to use for profiling jobs (required)
  profilerImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.6.1"

  # ProfilingConfig maps directly to the profile_sla.py config format
  # See dynamo/profiler/utils/profiler_argparse.py for complete schema
  # Note: deployment.model and engine.backend are automatically set from model and backend above
  profilingConfig:
    config:
      # Optional: Output directory for profiling results (defaults to /data in the Job)
      # output_dir: "profiling_results"

      # Engine configuration
      engine:
        maxContextLength: 16384  # will override max context length of the model if provided

      # Search strategy: 'rapid' for AI Configurator estimation (20-30s), 'thorough' for actual deployments (2-4h)
      searchStrategy: thorough

      # Hardware configuration
      # Note: Operator auto-discovers GPU info from cluster nodes when available
      hardware:
        minNumGpusPerEngine: 1  # Minimum GPUs to test
        maxNumGpusPerEngine: 4  # Maximum GPUs to test
        numGpusPerNode: 8       # GPUs per node (optional - auto-discovered if not specified)
        system: h200_sxm        # Hardware system (optional - auto-detected if not specified)
        # gpuModel: "H200-SXM"  # GPU model (optional - auto-discovered)
        # gpuVramMib: 141557    # GPU VRAM in MiB (optional - auto-discovered)

      # Sweep/profiling configuration
      sweep:
        prefillInterpolationGranularity: 16  # Samples for TTFT interpolation
        decodeInterpolationGranularity: 6    # Samples for ITL interpolation

      # SLA targets for profiling
      sla:
        isl: 3000  # Input sequence length
        osl: 500   # Output sequence length
        ttft: 50.0  # Time To First Token target (milliseconds)
        itl: 10.0   # Inter-Token Latency target (milliseconds)

      # Optional: Planner-specific arguments
      # planner:
      #   plannerMinEndpoint: 2
      #   # Add any other planner args here

    # Reference to ConfigMap containing the DGD base config (disagg.yaml)
    # The path to this file will be automatically set as engine.config
    configMapRef:
      name: my-profiling-config
      key: disagg.yaml  # defaults to "disagg.yaml"

  # Optional: Automatically create DynamoGraphDeployment after profiling
  autoApply: true  # default is false

  # Optional: Override metadata for auto-created DGD (only used when autoApply: true)
  # deploymentOverrides:
  #   name: my-custom-dgd-name
  #   namespace: production
  #   labels:
  #     team: ml-platform
  #   annotations:
  #     description: "Auto-generated from DGDR"