# nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml

# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Sample DynamoGraphDeploymentRequest (DGDR): describes an SLA-driven profiling
# run for one model and, via autoApply, the follow-up DynamoGraphDeployment (DGD).
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeploymentRequest
metadata:
  name: example-llm-sla
spec:
  # Required. High-level identifier of the model being deployed.
  # Injected into profilingConfig.config.deployment.model.
  model: 'Qwen/Qwen3-0.6B'

  # Required. Backend used for profiling.
  # Injected into profilingConfig.config.engine.backend.
  backend: trtllm

  # Required. Container image used for the profiling jobs.
  profilerImage: 'nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.6.1'

  # Maps one-to-one onto the profile_sla.py config format; the complete schema
  # lives in benchmarks/profiler/utils/profiler_argparse.py.
  # deployment.model and engine.backend do not need to be repeated here: they
  # are filled in automatically from `model` and `backend` above.
  profilingConfig:
    config:
      # Optional. Output directory for profiling results
      # (defaults to /data in the Job).
      # output_dir: "profiling_results"

      # --- Engine ---
      engine:
        # When provided, overrides the model's own max context length.
        max_context_length: 16384

      # --- Hardware ---
      hardware:
        # Minimum number of GPUs per engine to test.
        min_num_gpus_per_engine: 1
        # Maximum number of GPUs per engine to test
        # (limited by the model's num_heads/4).
        max_num_gpus_per_engine: 4
        # GPUs per node (for MoE models).
        num_gpus_per_node: 8

      # --- Sweep / profiling ---
      sweep:
        # Number of samples for TTFT interpolation.
        prefill_interpolation_granularity: 16
        # Number of samples for ITL interpolation.
        decode_interpolation_granularity: 6

        # Profiling mode switch:
        #   AI Configurator mode - fast simulation-based profiling (20-30 seconds)
        #   false                - online profiling (2-4 hours)
        use_ai_configurator: false
        # Target GPU system for AI Configurator.
        aic_system: 'h200_sxm'
        # HuggingFace model ID for AI Configurator.
        aic_hf_id: 'Qwen/Qwen3-0.6B'
        # Backend version for AI Configurator (quoted so it stays a string).
        aic_backend_version: '0.20.0'

      # --- SLA targets for profiling ---
      sla:
        # Input sequence length.
        isl: 3000
        # Output sequence length.
        osl: 500
        # Time To First Token target, in milliseconds.
        ttft: 50.0
        # Inter-Token Latency target, in milliseconds.
        itl: 10.0

      # Optional. Planner-specific arguments.
      # planner:
      #   planner_min_endpoint: 2
      #   # Add any other planner args here (use hyphens or underscores)

    # ConfigMap that holds the DGD base config (disagg.yaml).
    # The path to that file is set as engine.config automatically.
    configMapRef:
      name: 'my-profiling-config'
      # Defaults to "disagg.yaml" when omitted.
      key: 'disagg.yaml'

  # Optional (default: false). Automatically create the DynamoGraphDeployment
  # once profiling has finished.
  autoApply: true

  # Optional. Metadata overrides for the auto-created DGD
  # (only used when autoApply: true).
  # deploymentOverrides:
  #   name: my-custom-dgd-name
  #   namespace: production
  #   labels:
  #     team: ml-platform
  #   annotations:
  #     description: "Auto-generated from DGDR"