# nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml

# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeploymentRequest
metadata:
  name: example-llm-sla
spec:
  # Required. High-level identifier of the model being deployed; the value is
  # injected into profilingConfig.config.deployment.model.
  model: 'Qwen/Qwen3-0.6B'

  # Required. Backend to use for profiling; the value is injected into
  # profilingConfig.config.engine.backend.
  backend: trtllm

  # Required. Container image used for the profiling jobs.
  profilerImage: 'nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.6.1'

  # Optional (default is false). When true, a DynamoGraphDeployment is created
  # automatically after profiling.
  autoApply: true

  # Optional, and only used when autoApply is true: metadata overrides for the
  # auto-created DGD.
  # deploymentOverrides:
  #   name: my-custom-dgd-name
  #   namespace: production
  #   labels:
  #     team: ml-platform
  #   annotations:
  #     description: "Auto-generated from DGDR"

  # profilingConfig maps directly to the profile_sla.py config format; see
  # dynamo/profiler/utils/profiler_argparse.py for the complete schema.
  # deployment.model and engine.backend do not appear below because they are
  # set automatically from spec.model and spec.backend.
  profilingConfig:
    # Reference to the ConfigMap containing the DGD base config (disagg.yaml).
    # The path to this file is automatically set as engine.config.
    configMapRef:
      name: my-profiling-config
      # "disagg.yaml" is also the default when key is omitted.
      key: 'disagg.yaml'

    config:
      # Optional. Output directory for profiling results; defaults to /data in
      # the Job.
      # NOTE(review): this key is snake_case while every other key in this
      # file is camelCase - confirm the expected spelling against the schema.
      # output_dir: "profiling_results"

      # Search strategy, one of:
      #   'rapid'    - AI Configurator estimation (20-30s)
      #   'thorough' - actual deployments (2-4h)
      searchStrategy: thorough

      # Engine configuration.
      engine:
        # If provided, overrides the max context length of the model.
        maxContextLength: 16384

      # Hardware configuration. The operator auto-discovers GPU info from
      # cluster nodes when available.
      hardware:
        # Optional. Hardware system; auto-detected if not specified.
        system: h200_sxm
        # Optional. GPUs per node; auto-discovered if not specified.
        numGpusPerNode: 8
        # Minimum GPUs to test.
        minNumGpusPerEngine: 1
        # Maximum GPUs to test.
        maxNumGpusPerEngine: 4
        # Optional. GPU model; auto-discovered.
        # gpuModel: "H200-SXM"
        # Optional. GPU VRAM in MiB; auto-discovered.
        # gpuVramMib: 141557

      # Sweep/profiling configuration.
      sweep:
        # Samples for TTFT interpolation.
        prefillInterpolationGranularity: 16
        # Samples for ITL interpolation.
        decodeInterpolationGranularity: 6

      # SLA targets for profiling.
      sla:
        # Input sequence length.
        isl: 3000
        # Output sequence length.
        osl: 500
        # Time To First Token target, in milliseconds.
        ttft: 50.0
        # Inter-Token Latency target, in milliseconds.
        itl: 10.0

      # Optional. Planner-specific arguments.
      # planner:
      #   plannerMinEndpoint: 2
      #   # Add any other planner args here