# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Example DynamoGraphDeploymentRequest (DGDR): profiles a model against SLA
# targets and, because autoApply is enabled below, creates the resulting
# DynamoGraphDeployment once profiling finishes.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeploymentRequest
metadata:
  name: example-llm-sla
spec:
  # Model is a high-level identifier for the model being deployed
  # (required - injected into profilingConfig.config.deployment.model)
  model: Qwen/Qwen3-0.6B

  # Backend to use for profiling
  # (required - injected into profilingConfig.config.engine.backend)
  backend: trtllm

  # ProfilerImage is the container image to use for profiling jobs (required)
  profilerImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.9.0"

  # ProfilingConfig maps directly to the profile_sla.py config format.
  # See dynamo/profiler/utils/profiler_argparse.py for the complete schema.
  # Note: deployment.model and engine.backend are automatically set from
  # `model` and `backend` above, so they are not repeated here.
  profilingConfig:
    config:
      # Optional: Output directory for profiling results
      # (defaults to /data in the Job)
      # output_dir: "profiling_results"

      # Engine configuration
      engine:
        # Overrides the model's max context length when provided
        maxContextLength: 16384

      # Search strategy:
      #   'rapid'    - AI Configurator estimation (20-30s)
      #   'thorough' - actual deployments (2-4h)
      searchStrategy: thorough

      # Hardware configuration
      # Note: Operator auto-discovers GPU info from cluster nodes when available
      hardware:
        minNumGpusPerEngine: 1  # Minimum GPUs to test
        maxNumGpusPerEngine: 4  # Maximum GPUs to test
        numGpusPerNode: 8  # GPUs per node (optional - auto-discovered if not specified)
        system: h200_sxm  # Hardware system (optional - auto-detected if not specified)
        # gpuModel: "H200-SXM"  # GPU model (optional - auto-discovered)
        # gpuVramMib: 141557  # GPU VRAM in MiB (optional - auto-discovered)

      # Sweep/profiling configuration
      sweep:
        prefillInterpolationGranularity: 16  # Samples for TTFT interpolation
        decodeInterpolationGranularity: 6  # Samples for ITL interpolation

      # SLA targets for profiling
      sla:
        isl: 3000  # Input sequence length
        osl: 500  # Output sequence length
        ttft: 50.0  # Time To First Token target (milliseconds)
        itl: 10.0  # Inter-Token Latency target (milliseconds)

      # Optional: Planner-specific arguments
      # planner:
      #   plannerMinEndpoint: 2
      #   # Add any other planner args here

    # Reference to ConfigMap containing the DGD base config (disagg.yaml).
    # The path to this file will be automatically set as engine.config.
    # NOTE(review): placed as a sibling of `config` under profilingConfig;
    # confirm this nesting against the DGDR CRD schema.
    configMapRef:
      name: my-profiling-config
      key: disagg.yaml  # defaults to "disagg.yaml"

  # Optional: Automatically create DynamoGraphDeployment after profiling
  autoApply: true  # default is false

  # Optional: Override metadata for auto-created DGD
  # (only used when autoApply: true)
  # deploymentOverrides:
  #   name: my-custom-dgd-name
  #   namespace: production
  #   labels:
  #     team: ml-platform
  #   annotations:
  #     description: "Auto-generated from DGDR"