# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeploymentRequest
metadata:
  name: example-llm-sla
spec:
  # Model is a high-level identifier for the model being deployed
  # (required - injected into profilingConfig.config.deployment.model)
  model: Qwen/Qwen3-0.6B

  # Backend to use for profiling
  # (required - injected into profilingConfig.config.engine.backend)
  backend: trtllm

  # ProfilerImage is the container image to use for profiling jobs (required)
  profilerImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.6.1"

  # ProfilingConfig maps directly to the profile_sla.py config format
  # See benchmarks/profiler/utils/profiler_argparse.py for complete schema
  # Note: deployment.model and engine.backend are automatically set from
  # model and backend above
  profilingConfig:
    config:
      # Optional: Output directory for profiling results (defaults to /data in the Job)
      # output_dir: "profiling_results"

      # Engine configuration
      engine:
        maxContextLength: 16384  # will override max context length of the model if provided

      # Hardware configuration
      hardware:
        minNumGpusPerEngine: 1  # Minimum GPUs to test
        maxNumGpusPerEngine: 4  # Maximum GPUs to test (limited by model's num_heads/4)
        numGpusPerNode: 8  # GPUs per node (for MoE models)

      # Sweep/profiling configuration
      sweep:
        prefillInterpolationGranularity: 16  # Samples for TTFT interpolation
        decodeInterpolationGranularity: 6  # Samples for ITL interpolation

        # AI Configurator mode (fast simulation-based profiling, 20-30 seconds)
        # false = online profiling (2-4 hours); true = AI Configurator simulation
        useAiConfigurator: false
        aicSystem: h200_sxm  # Target GPU system for AI Configurator
        aicHfId: Qwen/Qwen3-0.6B  # HuggingFace model ID for AI Configurator
        aicBackendVersion: "0.20.0"  # Backend version for AI Configurator

      # SLA targets for profiling
      sla:
        isl: 3000  # Input sequence length
        osl: 500  # Output sequence length
        ttft: 50.0  # Time To First Token target (milliseconds)
        itl: 10.0  # Inter-Token Latency target (milliseconds)

      # Optional: Planner-specific arguments
      # planner:
      #   plannerMinEndpoint: 2
      #   # Add any other planner args here

    # Reference to ConfigMap containing the DGD base config (disagg.yaml)
    # The path to this file will be automatically set as engine.config
    configMapRef:
      name: my-profiling-config
      key: disagg.yaml  # defaults to "disagg.yaml"

  # Optional: Automatically create DynamoGraphDeployment after profiling
  autoApply: true  # default is false

  # Optional: Override metadata for auto-created DGD (only used when autoApply: true)
  # deploymentOverrides:
  #   name: my-custom-dgd-name
  #   namespace: production
  #   labels:
  #     team: ml-platform
  #   annotations:
  #     description: "Auto-generated from DGDR"