Unverified Commit d56439ec authored by hhzhang16's avatar hhzhang16 Committed by GitHub
Browse files

feat: migrate GPU discovery from Dynamo Profiler to Dynamo Operator with...


feat: migrate GPU discovery from Dynamo Profiler to Dynamo Operator with automatic injection (#6224)
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
parent 233a1e9a
......@@ -28,6 +28,7 @@ import (
func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
validConfig := `{"engine": {"backend": "vllm"}, "deployment": {"model": "test-model"}}`
validConfigWithHardware := `{"engine": {"backend": "vllm"}, "deployment": {"model": "test-model"}, "hardware": {"numGpusPerNode": 8, "gpuModel": "H100-SXM5-80GB", "gpuVramMib": 81920}}`
configWithDifferentBackend := `{"engine": {"backend": "sglang"}}`
configWithDifferentModel := `{"deployment": {"model": "different-model"}}`
invalidYAML := `{invalid yaml`
......@@ -128,65 +129,19 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
errMsg: "spec.profilingConfig.config is required and must not be empty",
},
{
name: "enableGpuDiscovery true for cluster-wide operator",
name: "namespace-restricted operator (GPU discovery will fail gracefully)",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
EnableGpuDiscovery: true,
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
},
},
},
},
isClusterWide: true,
wantErr: false,
},
{
name: "enableGpuDiscovery true for namespace-restricted operator",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
EnableGpuDiscovery: true,
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
},
},
},
},
isClusterWide: false,
wantErr: true,
errMsg: "spec.enableGpuDiscovery can only be set to true for cluster-wide operators. Namespace-restricted operators cannot access cluster nodes for GPU discovery. Please set enableGpuDiscovery to false and provide hardware configuration (hardware.min_num_gpus_per_engine, hardware.max_num_gpus_per_engine, hardware.num_gpus_per_node) in spec.profilingConfig.config",
},
{
name: "enableGpuDiscovery false for namespace-restricted operator",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
EnableGpuDiscovery: false,
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
Raw: []byte(validConfigWithHardware),
},
},
},
......@@ -263,16 +218,15 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
expectedWarning: "spec.profilingConfig.config.deployment.model (different-model) will be overwritten by spec.model (llama-3-8b)",
},
{
name: "multiple errors (missing profiler image, missing config, and enableGpuDiscovery for namespace-restricted)",
name: "multiple errors (missing profiler image and missing config)",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
EnableGpuDiscovery: true,
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "",
Config: nil,
......@@ -281,9 +235,12 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
},
isClusterWide: false,
wantErr: true,
errMsg: "spec.profilingConfig.profilerImage is required\nspec.profilingConfig.config is required and must not be empty\nspec.enableGpuDiscovery can only be set to true for cluster-wide operators",
errMsg: "spec.profilingConfig.profilerImage is required\nspec.profilingConfig.config is required and must not be empty",
errContains: true,
},
// TODO: Add test for invalid GPU range (min > max) validation
// The validation logic is in place (lines 148-152 of dynamographdeploymentrequest.go)
// but needs proper test coverage
}
for _, tt := range tests {
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import argparse
import json
import logging
import re
import shutil
import subprocess
import time
import uuid
from dataclasses import asdict, dataclass
from typing import Dict, List, Optional, Tuple, Union
from kubernetes import client, config
# Module-level logger: INFO and above, written to a console stream handler
# using a timestamped "time - name - level - message" layout.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

formatter = logging.Formatter(
    fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(formatter)

logger.addHandler(console_handler)
def run_command(
    cmd: List[str], capture_output: bool = True, exit_on_error: bool = True
) -> "subprocess.CompletedProcess[str]":
    """Run ``cmd`` as a subprocess in text mode and return the completed process.

    Args:
        cmd: Program and arguments, passed to ``subprocess.run`` as a list.
        capture_output: Forwarded to ``subprocess.run``; when true, stdout and
            stderr are available on the returned object.
        exit_on_error: Despite the name, this does not exit the process. When
            true, a non-zero exit status is logged (including any captured
            stdout/stderr) and converted into a ``RuntimeError``. When false,
            the original ``subprocess.CalledProcessError`` propagates unchanged.

    Returns:
        The ``subprocess.CompletedProcess`` for a command that exited with 0.

    Raises:
        RuntimeError: The command failed and ``exit_on_error`` is true. The
            original ``CalledProcessError`` is attached as ``__cause__``.
        subprocess.CalledProcessError: The command failed and
            ``exit_on_error`` is false.
    """
    try:
        return subprocess.run(cmd, capture_output=capture_output, text=True, check=True)
    except subprocess.CalledProcessError as e:  # pragma: no cover - passthrough
        if not exit_on_error:
            raise
        failure = f"Command failed: {' '.join(cmd)}"
        logger.error(failure)
        if e.stdout:
            logger.error(e.stdout)
        if e.stderr:
            logger.error(e.stderr)
        # Chain the original error so the exit status stays reachable.
        raise RuntimeError(failure) from e
# Keys under the "nvidia.com/" prefix that this module looks up on a node
# (in its labels, and in its allocatable resources for the bare prefix).
NVIDIA_PREFIX = "nvidia.com/"
LABEL_GPU_COUNT = NVIDIA_PREFIX + "gpu.count"
LABEL_GPU_PRODUCT = NVIDIA_PREFIX + "gpu.product"
LABEL_GPU_MEMORY = NVIDIA_PREFIX + "gpu.memory"  # MiB per GPU
LABEL_MIG_CAPABLE = NVIDIA_PREFIX + "mig.capable"
@dataclass
class NodeGpuInventory:
    """GPU facts gathered for a single Kubernetes node.

    Every GPU-related field may be ``None`` when the node does not expose the
    corresponding label or resource.
    """

    # Node identity.
    node_name: str

    # Values read from node labels (count may fall back to allocatable).
    gpu_count: Optional[int]
    gpu_product: Optional[str]
    gpu_memory_mib: Optional[int]
    mig_capable: Optional[bool]

    # Values read from the node's allocatable resources.
    allocatable_gpu: Optional[int]
    mig_resources: Dict[str, str]

    def to_dict(self) -> Dict[str, Union[str, int, bool, Dict[str, str], None]]:
        """Return the inventory as a plain dict keyed by field name."""
        return asdict(self)
def _parse_int(value: Optional[str]) -> Optional[int]:
    """Convert ``value`` to an int, tolerating surrounding non-digit text.

    ``None`` maps to ``None``. A value that ``int()`` accepts is returned
    directly. Otherwise the first run of decimal digits in ``str(value)`` is
    used, and ``None`` is returned when there is no digit at all.
    """
    if value is None:
        return None
    try:
        return int(value)
    except (TypeError, ValueError):
        pass
    # Fall back to the first digit run, e.g. "16Gi" -> 16.
    digits = re.search(r"\d+", str(value))
    if digits is None:
        return None
    return int(digits.group(0))
def _bool_from_str(value: Optional[str]) -> Optional[bool]:
    """Interpret a label value as a boolean.

    Matching is case-insensitive and ignores surrounding whitespace.
    "true"/"1"/"yes" give ``True``, "false"/"0"/"no" give ``False``, and
    anything else (including ``None``) gives ``None``.
    """
    if value is None:
        return None
    recognised = {
        "true": True,
        "1": True,
        "yes": True,
        "false": False,
        "0": False,
        "no": False,
    }
    return recognised.get(str(value).strip().lower())
def _normalize_node(node: Union[client.V1Node, Dict]) -> Dict:
    """Return ``node`` as a dict so callers can use uniform key access.

    Objects exposing ``to_dict`` (such as a client ``V1Node``) are converted
    through it; anything else is assumed to already be a dict and is returned
    as-is.
    """
    if not hasattr(node, "to_dict"):
        return node  # assume already dict
    return node.to_dict()
def _extract_inventory(node_obj: Dict) -> NodeGpuInventory:
    """Build a ``NodeGpuInventory`` from a node represented as a dict.

    Args:
        node_obj: Node dict with optional ``metadata`` (``name``, ``labels``)
            and ``status`` (``allocatable``) sections.

    Returns:
        The inventory for that node. The GPU count comes from the
        ``LABEL_GPU_COUNT`` label when present, otherwise from the allocatable
        ``nvidia.com/gpu`` resource. Fields whose source is missing are
        ``None``; a missing or empty name becomes ``"<unknown>"``.
    """
    # `or {}` (rather than a .get default) also covers sections that are
    # present but null, which would otherwise fail on the nested .get calls.
    meta = node_obj.get("metadata") or {}
    status = node_obj.get("status") or {}
    labels = meta.get("labels") or {}
    alloc = status.get("allocatable") or {}

    alloc_gpu = _parse_int(alloc.get(f"{NVIDIA_PREFIX}gpu"))

    # Prefer GFD-reported GPU count if present; otherwise use allocatable nvidia.com/gpu
    gpu_count = _parse_int(labels.get(LABEL_GPU_COUNT))
    if gpu_count is None:
        gpu_count = alloc_gpu

    # Collect MIG resource keys and counts if present. Entries whose count is
    # zero or cannot be parsed are left out (the parsed value must be truthy).
    mig_prefix = f"{NVIDIA_PREFIX}mig-"
    mig_resources: Dict[str, str] = {
        k: str(v)
        for k, v in alloc.items()
        if isinstance(k, str) and k.startswith(mig_prefix) and _parse_int(str(v))
    }

    return NodeGpuInventory(
        node_name=meta.get("name") or "<unknown>",
        gpu_count=gpu_count,
        gpu_product=labels.get(LABEL_GPU_PRODUCT),
        gpu_memory_mib=_parse_int(labels.get(LABEL_GPU_MEMORY)),
        mig_capable=_bool_from_str(labels.get(LABEL_MIG_CAPABLE)),
        allocatable_gpu=alloc_gpu,
        mig_resources=mig_resources,
    )
def _list_nodes_via_client() -> List[Dict]:
    """List cluster nodes through the Kubernetes Python client.

    Loads the in-cluster configuration, so this is expected to run inside a
    pod with a service account.

    Returns:
        One dict per node, as produced by ``_normalize_node``.

    Raises:
        RuntimeError: The in-cluster configuration could not be loaded. The
            underlying exception is attached as ``__cause__``.
    """
    # Assume running inside a Kubernetes pod with service account
    try:
        config.load_incluster_config()
    except Exception as e:
        raise RuntimeError(
            f"Failed to load in-cluster Kubernetes config. Ensure this runs in a pod with a service account. Error: {e}"
        ) from e
    v1 = client.CoreV1Api()
    items = v1.list_node().items  # type: ignore[attr-defined]
    return [_normalize_node(n) for n in items]
def _list_nodes_via_kubectl() -> List[Dict]:
    """List cluster nodes by parsing ``kubectl get nodes -o json``.

    Returns:
        The ``items`` array of the kubectl JSON output, or an empty list when
        that key is absent.

    Raises:
        RuntimeError: ``kubectl`` is not on ``PATH``.
    """
    kubectl_path = shutil.which("kubectl")
    if not kubectl_path:
        raise RuntimeError("kubectl not found in PATH for fallback")
    completed = run_command(
        ["kubectl", "get", "nodes", "-o", "json"], capture_output=True
    )
    return json.loads(completed.stdout).get("items", [])
def collect_gpu_inventory(
    prefer_client: bool = True,
) -> Tuple[List[NodeGpuInventory], str]:
    """Collect per-node GPU inventory, falling back between two node sources.

    Args:
        prefer_client: When true, try the Kubernetes Python client first and
            ``kubectl`` second; when false, try them in the opposite order.

    Returns:
        ``(inventories, sources)`` where ``sources`` is a comma-joined list of
        the source names attempted so far, ending with the one that succeeded
        ("kubernetes-client" and/or "kubectl-json").

    Raises:
        RuntimeError: Both sources failed. The message joins both error
            strings with " | " and the last failure is attached as
            ``__cause__``.
    """
    ordered_sources = [
        ("kubernetes-client", _list_nodes_via_client),
        ("kubectl-json", _list_nodes_via_kubectl),
    ]
    if not prefer_client:
        ordered_sources.reverse()

    sources_tried: List[str] = []
    errors: List[str] = []
    last_error: Optional[Exception] = None
    for source_name, list_nodes in ordered_sources:
        sources_tried.append(source_name)
        try:
            # Extraction stays inside the try so that a failure while parsing
            # one source's nodes also triggers the fallback.
            inventories = [_extract_inventory(n) for n in list_nodes()]
            return inventories, ",".join(sources_tried)
        except Exception as exc:
            errors.append(str(exc))
            last_error = exc
    raise RuntimeError("Failed to list nodes: " + " | ".join(errors)) from last_error
def _format_gib(mib: Optional[int]) -> str:
    """Render a MiB amount as GiB with one decimal, or "" when unknown."""
    return "" if mib is None else f"{mib / 1024:.1f} GiB"
def print_table(rows: List[NodeGpuInventory], show_mig: bool = False) -> None:
    """Log the inventory as an aligned text table, one logger call per line.

    Args:
        rows: Inventories to display, one table row each.
        show_mig: When true, the MIG column lists MIG resource names and
            counts (or "capable"/"no"); when false it shows "yes" for
            MIG-capable nodes and is blank otherwise.
    """
    headers = ["NODE", "GPUS", "MODEL", "VRAM/GPU", "MIG"]

    def _mig_detail(inv: NodeGpuInventory) -> str:
        # Detailed MIG text: resource list, "capable", "no", or "" if unknown.
        if inv.mig_capable is False:
            return "no"
        if inv.mig_capable is not True:
            return ""
        if not inv.mig_resources:
            return "capable"
        return ",".join(
            f"{name.split('/')[-1]}={count}"
            for name, count in sorted(inv.mig_resources.items())
        )

    table: List[List[str]] = []
    for inv in rows:
        detail = _mig_detail(inv)
        brief = "yes" if inv.mig_capable else ""
        table.append(
            [
                inv.node_name,
                "" if inv.gpu_count is None else str(inv.gpu_count),
                inv.gpu_product or "",
                _format_gib(inv.gpu_memory_mib),
                detail if show_mig else brief,
            ]
        )

    # Each column is as wide as its header or its widest cell.
    widths = [
        max([len(header)] + [len(line[col]) for line in table])
        for col, header in enumerate(headers)
    ]

    def _fmt_row(cells: List[str]) -> str:
        return " ".join(cell.ljust(width) for cell, width in zip(cells, widths))

    for line in [headers, ["-" * w for w in widths], *table]:
        logger.info(_fmt_row(line))
def aggregate_valued_rows(
    rows: List[NodeGpuInventory],
) -> Tuple[Optional[NodeGpuInventory], int]:
    """Aggregate rows that have meaningful GPU metadata.

    Rows with neither a product nor a memory value are ignored. The rest are
    grouped by ``(gpu_product, gpu_memory_mib)``, and each group is ranked by
    the largest known ``gpu_count`` among its rows.

    Preference order when multiple distinct values exist:
    1) Larger GPUs per node (gpu_count)
    2) Larger VRAM per GPU (gpu_memory_mib); an unknown value ranks lowest

    Ties keep the group that was seen first.

    Returns:
        ``(selected_row_like, distinct_count)``. ``selected_row_like`` is a
        synthetic inventory named "<aggregate>" carrying the winning product,
        memory and GPU count (``None`` if no positive count is known), or
        ``None`` when no row qualifies. ``distinct_count`` is the number of
        distinct ``(product, memory)`` pairs.
    """
    # Largest known gpu_count per (product, vram_mib) pair. A plain dict keeps
    # first-seen order, which is what breaks ties below.
    max_gpu_by_key: Dict[Tuple[Optional[str], Optional[int]], int] = {}
    for r in rows:
        if r.gpu_product is None and r.gpu_memory_mib is None:
            continue
        key = (r.gpu_product, r.gpu_memory_mib)
        known_count = int(r.gpu_count) if r.gpu_count is not None else 0
        max_gpu_by_key[key] = max(max_gpu_by_key.get(key, 0), known_count)

    if not max_gpu_by_key:
        return None, 0

    def rank(
        item: Tuple[Tuple[Optional[str], Optional[int]], int]
    ) -> Tuple[int, int]:
        (_prod, mem_mib), max_gpu = item
        return (max_gpu, mem_mib if mem_mib is not None else -1)

    # max() returns the first of equally ranked groups, matching a stable
    # descending sort.
    (sel_prod, sel_mem_mib), sel_gpu = max(max_gpu_by_key.items(), key=rank)

    selected = NodeGpuInventory(
        node_name="<aggregate>",
        gpu_count=sel_gpu if sel_gpu > 0 else None,
        gpu_product=sel_prod,
        gpu_memory_mib=sel_mem_mib,
        mig_capable=None,
        allocatable_gpu=None,
        mig_resources={},
    )
    return selected, len(max_gpu_by_key)
def _get_current_namespace(default: str = "default") -> str:
    """Return the namespace from the service-account mount, else ``default``.

    ``default`` is used when the namespace file cannot be read or is empty
    after stripping whitespace.
    """
    namespace_file = "/var/run/secrets/kubernetes.io/serviceaccount/namespace"
    try:
        with open(namespace_file, "r") as f:
            found = f.read().strip()
    except Exception:
        return default
    return found or default
def _parse_smi_output(logs: str) -> Tuple[Optional[str], Optional[int]]:
    """Return (name, memory) from the first usable "name, memory" CSV line."""
    for line in logs.splitlines():
        parts = [x.strip() for x in line.split(",")]
        if len(parts) >= 2 and parts[0]:
            mem_match = re.search(r"\d+", parts[1])
            return parts[0], int(mem_match.group(0)) if mem_match else None
    return None, None


def enrich_with_smi(
    rows: List[NodeGpuInventory],
    namespace: Optional[str] = None,
    timeout_seconds: int = 180,
) -> None:
    """For nodes missing product/memory labels, schedule a short-lived pod on each node
    that requests 1 GPU and runs nvidia-smi to capture model and memory.

    Requires permissions: create/get/delete pods and get pods/log in the namespace.

    Args:
        rows: Inventories to update in place. A row is skipped when it has no
            GPUs (count missing or zero) or already has both product and
            memory. Values that are already set are never overwritten.
        namespace: Namespace for the probe pods; defaults to the result of
            ``_get_current_namespace()``.
        timeout_seconds: Maximum time to wait per node for the pod to reach
            the Succeeded or Failed phase before reading its logs anyway.

    Raises:
        Exception: Errors from creating or polling a probe pod propagate to
            the caller (after a deletion attempt for that pod). Failures to
            read logs or delete the pod are logged and otherwise ignored.
    """
    ns = namespace or _get_current_namespace()
    try:
        config.load_incluster_config()
    except Exception as exc:
        # Best effort, as before: continue with whatever client configuration
        # is in effect (presumably one loaded earlier by the caller).
        logger.debug("Could not load in-cluster Kubernetes config: %s", exc)
    v1 = client.CoreV1Api()

    for inv in rows:
        if not inv.gpu_count or (
            inv.gpu_product is not None and inv.gpu_memory_mib is not None
        ):
            continue

        pod_name = f"gpu-inv-smi-{uuid.uuid4().hex[:6]}"
        container = client.V1Container(
            name="smi",
            image="nvidia/cuda:12.3.2-base-ubuntu22.04",
            command=["bash", "-lc"],
            args=[
                "nvidia-smi --query-gpu=name,memory.total --format=csv,noheader,nounits"
            ],
            resources=client.V1ResourceRequirements(
                limits={"nvidia.com/gpu": "1", "cpu": "100m", "memory": "128Mi"},
                requests={"nvidia.com/gpu": "1", "cpu": "50m", "memory": "64Mi"},
            ),
        )
        pod = client.V1Pod(
            api_version="v1",
            kind="Pod",
            metadata=client.V1ObjectMeta(name=pod_name, namespace=ns),
            spec=client.V1PodSpec(
                restart_policy="Never",
                # Pin the pod to the node being probed.
                node_name=inv.node_name,
                containers=[container],
            ),
        )

        logs = ""
        try:
            v1.create_namespaced_pod(namespace=ns, body=pod)
            # Monotonic clock so wall-clock adjustments cannot stretch or
            # shorten the wait.
            deadline = time.monotonic() + timeout_seconds
            while time.monotonic() < deadline:
                p = v1.read_namespaced_pod(name=pod_name, namespace=ns)
                phase = (p.status.phase or "").lower()
                if phase in ("succeeded", "failed"):
                    break
                time.sleep(2)
            try:
                logs = v1.read_namespaced_pod_log(name=pod_name, namespace=ns)
            except Exception as exc:
                logger.debug("Could not read logs of pod %s/%s: %s", ns, pod_name, exc)
                logs = ""
        finally:
            # Always try to remove the probe pod, even if creation or polling failed.
            try:
                v1.delete_namespaced_pod(
                    name=pod_name, namespace=ns, body=client.V1DeleteOptions()
                )
            except Exception as exc:
                logger.warning(
                    "Could not delete probe pod %s/%s: %s", ns, pod_name, exc
                )

        # The number is stored as MiB; presumably that is the unit nvidia-smi
        # uses for memory.total with `nounits` (verify if this matters).
        product, memory_mib = _parse_smi_output(logs)
        if product is not None:
            inv.gpu_product = inv.gpu_product or product
            if memory_mib is not None:
                inv.gpu_memory_mib = inv.gpu_memory_mib or memory_mib
def get_gpu_summary(
    prefer_client: bool = True, enrich_smi: bool = True
) -> Dict[str, object]:
    """Return an aggregate GPU summary for the cluster.

    Selection policy when multiple values exist: prefer higher GPUs per node,
    then higher VRAM/GPU. Returns dict with keys: gpus_per_node, model, vram.
    If model/VRAM unavailable anywhere, returns {"gpus_per_node": max_gpus, "model": "", "vram": 0}.

    Args:
        prefer_client: Passed to ``collect_gpu_inventory`` to choose which
            node source is tried first.
        enrich_smi: When true, run ``enrich_with_smi`` on the collected rows
            before aggregating.
    """
    # TODO: use proper tools (i.e., DCGM) to get GPU inventory
    rows, _ = collect_gpu_inventory(prefer_client=prefer_client)
    if enrich_smi:
        enrich_with_smi(rows)

    agg, _distinct = aggregate_valued_rows(rows)
    if agg is not None:
        return {
            "gpus_per_node": 0 if agg.gpu_count is None else int(agg.gpu_count),
            "model": agg.gpu_product or "",
            "vram": 0 if agg.gpu_memory_mib is None else int(agg.gpu_memory_mib),
        }

    # Fallback to max GPUs only
    known_counts = [int(r.gpu_count) for r in rows if r.gpu_count is not None]
    return {"gpus_per_node": max([0] + known_counts), "model": "", "vram": 0}
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the inventory report."""
    parser = argparse.ArgumentParser(
        description="Report GPU inventory per Kubernetes node (count, SKU, VRAM)."
    )
    parser.add_argument(
        "--format",
        "-o",
        choices=["table", "json"],
        default="table",
        help="Output format",
    )
    parser.add_argument(
        "--prefer",
        choices=["client", "kubectl"],
        default="client",
        help="Prefer Kubernetes Python client or kubectl JSON fallback",
    )
    parser.add_argument(
        "--show-mig",
        action="store_true",
        help="In table output, show MIG resource types and counts",
    )
    parser.add_argument(
        "--enrich-smi",
        action="store_true",
        help="Schedule short-lived pods per node to fetch model/VRAM via nvidia-smi",
    )
    parser.add_argument(
        "--aggregate",
        action="store_true",
        help="Print a single representative (GPUs per node, MODEL, VRAM/GPU). Warn if multiple values exist",
    )
    return parser


def main() -> None:
    """Command-line entry point: collect the GPU inventory and report it.

    Output goes through the module logger, either as a JSON document or as a
    table, optionally followed by a one-line aggregate.
    """
    args = _build_arg_parser().parse_args()

    rows, source = collect_gpu_inventory(prefer_client=(args.prefer == "client"))
    if args.enrich_smi:
        enrich_with_smi(rows)

    if args.format == "json":
        # NOTE(review): the JSON is emitted via the logger, so the configured
        # log prefix precedes it; confirm consumers do not parse this output.
        payload = {
            "source": source,
            "items": [r.to_dict() for r in rows],
        }
        logger.info(json.dumps(payload, indent=2))
        return

    # Table output
    print_table(rows, show_mig=args.show_mig)
    if not args.aggregate:
        return

    agg, distinct = aggregate_valued_rows(rows)
    if agg is None:
        logger.warning("No nodes expose MODEL/VRAM; cannot aggregate")
        return
    if distinct > 1:
        logger.warning(
            f"Multiple distinct GPU model/VRAM pairs detected across nodes: {distinct}. Showing highest GPUs per node, then highest VRAM/GPU."
        )

    # Print concise aggregate line
    gpus = "" if agg.gpu_count is None else agg.gpu_count
    model = agg.gpu_product or ""
    vram = _format_gib(agg.gpu_memory_mib)
    logger.info(f"Aggregate => GPUS: {gpus} MODEL: {model} VRAM/GPU: {vram}")


if __name__ == "__main__":
    main()
<!--
SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
-->
# Profiler Guide
This guide covers deployment, configuration, integration, and troubleshooting for the Dynamo Profiler.
## What is a DynamoGraphDeploymentRequest (DGDR)?
A **DynamoGraphDeploymentRequest (DGDR)** is a Kubernetes Custom Resource that serves as the primary interface for users to request model deployments with specific performance and resource constraints. You specify:
- **What** model you want to deploy (`model`)
- **How** it should perform (SLA targets: `ttft`, `itl`)
- **Where** it should run (optional GPU preferences)
- **Which** backend to use (`backend`: vllm, sglang, or trtllm)
- **Which** images to use (`profilingConfig.profilerImage`, `deploymentOverrides.workersImage`)
The Dynamo Operator watches for DGDRs and automatically:
1. Discovers available GPU resources in your cluster
2. Runs profiling (online or offline) to find optimal configurations
3. Generates an optimized DynamoGraphDeployment (DGD) configuration
4. Deploys the DGD to your cluster
**Relationship to DGD:**
- **DGDR**: High-level "intent" - what you want deployed
- **DGD**: Low-level "implementation" - how it's deployed
## Support Matrix
| Backend | Dense Models | MoE Models |
|---------|-------------|------------|
| vLLM | ✅ | 🚧 |
| SGLang | ✅ | ✅ |
| TensorRT-LLM | ✅ | 🚧 |
The profiler sweeps over the following parallelization mappings for prefill and decode:
| Model Architecture | Prefill Parallelization Mapping | Decode Parallelization Mapping |
|---------|-------------|------------|
| MLA+MoE (DeepseekV3ForCausalLM, DeepseekV32ForCausalLM) | TEP, DEP | TEP, DEP |
| GQA+MoE (Qwen3MoeForCausalLM) | TP, TEP, DEP | TP, TEP, DEP |
| Other Models | TP | TP |
> [!NOTE]
> Exact model × parallelization mapping support depends on the backend. The profiler does not guarantee that the backend supports the recommended P/D engine configuration or runs it without bugs.
## Deployment
### Kubernetes Deployment (DGDR)
The recommended deployment method is through DGDRs. Sample configurations are provided in `benchmarks/profiler/deploy/`:
| Sample | Description |
|--------|-------------|
| `profile_sla_dgdr.yaml` | Standard online profiling with AIPerf |
| `profile_sla_aic_dgdr.yaml` | Fast offline profiling with AI Configurator |
| `profile_sla_moe_dgdr.yaml` | MoE model profiling (SGLang) |
#### Container Images
Each DGDR requires container images for profiling and deployment:
- **`profilingConfig.profilerImage`** (Required): Container image for the profiling job. Must contain the profiler code and dependencies.
- **`deploymentOverrides.workersImage`** (Optional): Container image for DGD worker components (frontend, workers, planner). If omitted, uses image from the base config file.
```yaml
spec:
profilingConfig:
profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
deploymentOverrides:
workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
```
#### Quick Start: Deploy with DGDR
**Step 1: Create Your DGDR**
Use a sample configuration or create your own:
```yaml
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeploymentRequest
metadata:
name: my-model-profiling
spec:
model: "Qwen/Qwen3-0.6B"
backend: vllm
profilingConfig:
profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
config:
sla:
isl: 3000
osl: 150
ttft: 200.0
itl: 20.0
deploymentOverrides:
workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
autoApply: true
```
**Step 2: Apply the DGDR**
```bash
export NAMESPACE=your-namespace
kubectl apply -f my-profiling-dgdr.yaml -n $NAMESPACE
```
**Step 3: Monitor Progress**
```bash
# View status
kubectl get dgdr -n $NAMESPACE
# Detailed status
kubectl describe dgdr my-model-profiling -n $NAMESPACE
# Watch profiling job logs
kubectl logs -f job/profile-my-model-profiling -n $NAMESPACE
```
**DGDR Status States:**
- `Pending`: Initial state, preparing to profile
- `Profiling`: Running profiling job (20-30 seconds for AIC, 2-4 hours for online)
- `Deploying`: Generating and applying DGD configuration
- `Ready`: DGD successfully deployed and running
- `Failed`: Error occurred (check events for details)
**Step 4: Access Your Deployment**
```bash
# Find the frontend service
kubectl get svc -n $NAMESPACE | grep frontend
# Port-forward to access locally
kubectl port-forward svc/<deployment>-frontend 8000:8000 -n $NAMESPACE
# Test the endpoint
curl http://localhost:8000/v1/models
```
> [!NOTE]
> DGDRs are **immutable**. To update SLAs or configuration, delete the existing DGDR and create a new one.
### Direct Script Execution
For advanced use cases or local development:
```bash
python -m benchmarks.profiler.profile_sla \
--backend vllm \
--config path/to/disagg.yaml \
--model meta-llama/Llama-3-8B \
--ttft 200 --itl 15 \
--isl 3000 --osl 150 \
--min-num-gpus 1 \
--max-num-gpus 8
```
## Profiling Method
The profiler follows a 5-step process:
1. **Hardware Setup**: The operator automatically discovers GPU specifications from cluster nodes when it has permission to read them (see [Automatic GPU Discovery](#automatic-gpu-discovery)). If discovery is unavailable, the profiler uses the user-specified hardware configuration or defaults.
2. **Identify Sweep Ranges**: Automatically determine minimum and maximum number of GPUs per engine. Minimum is determined by the model size and GPU VRAM. Maximum is set to one node for dense models and 4 nodes for MoE models.
3. **Parallelization Mapping Sweep**: Test performance of engines with different parallelization mappings using the input ISL and OSL.
- For dense models, test different TP sizes for both prefill and decode.
- For MoE models (SGLang), evaluate both TEP and DEP as candidates for prefill and decode.
- **Prefill**:
- TP/TEP: Measure TTFT with batch size = 1 (assuming ISL is long enough to saturate compute) without KV reuse.
- DEP: Attention uses data parallelism. Send a single burst with total concurrency `attention_dp_size × attn_dp_num_req_ratio` (defaults to 4) and compute the reported TTFT as `time_to_first_token.max / attn_dp_num_req_ratio` from the AIPerf summary of that burst.
![Prefill Performance](../../images/h100_prefill_performance.png)
- **Decode**: Measure the ITL under different numbers of in-flight requests, from 1 to the maximum the KV cache can hold. To measure ITL without being affected by piggy-backed prefill requests, the script enables KV-reuse and warms up the engine by issuing the same prompts before measuring.
![Decode Performance](../../images/h100_decode_performance.png)
4. **Recommendation**: Select optimal parallelization mapping for prefill and decode that achieves the highest per-GPU throughput while adhering to the SLA on TTFT and ITL.
5. **In-Depth Profiling on the Recommended P/D Engine**: Interpolate TTFT with ISL and ITL with active KV cache and decode context length for more accurate performance estimation.
![ITL Interpolation](../../images/pd_interpolation.png)
- **Prefill**: Measures TTFT and throughput per GPU across different input lengths with batch size=1.
- **Decode**: Measures ITL and throughput per GPU under various KV cache loads and decode context lengths.
### AIPerf on Real Engines
Profiles your model by creating real test deployments in Kubernetes and measuring their performance.
- **Duration**: 2-4 hours
- **Accuracy**: Highest (real measurements)
- **GPU Requirements**: Full access to test different parallelization mappings
- **Backends**: vLLM, SGLang, TensorRT-LLM
```yaml
profilingConfig:
config:
sweep:
useAiConfigurator: false # Default
```
### AI Configurator Simulation
Uses performance simulation to rapidly estimate optimal configurations without running real deployments.
- **Duration**: 20-30 seconds
- **Accuracy**: Estimated (may have errors for unusual configurations)
- **GPU Requirements**: None
- **Backends**: TensorRT-LLM only (vLLM/SGLang coming soon)
```yaml
profilingConfig:
config:
sweep:
useAiConfigurator: true
aicSystem: h200_sxm
aicHfId: Qwen/Qwen3-32B
aicBackendVersion: "0.20.0" # TRT-LLM version simulated by AIC
```
> [!NOTE]
> `aicBackendVersion` specifies the TensorRT-LLM version that AI Configurator simulates. See the [AI Configurator supported features](https://github.com/ai-dynamo/aiconfigurator#supported-features) for available versions.
**Currently supports:**
- **Backends**: TensorRT-LLM (versions 0.20.0, 1.0.0rc3, 1.0.0rc6)
- **Systems**: H100 SXM, H200 SXM, B200 SXM, GB200 SXM, A100 SXM
- **Models**: Wide range including GPT, Llama, Mixtral, DeepSeek, Qwen, and more
See [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#supported-features) for the full list.
### Automatic GPU Discovery
The operator automatically discovers GPU resources from your Kubernetes cluster nodes when available. GPU discovery provides:
- Hardware information (GPU model, VRAM, GPUs per node)
- Automatic calculation of profiling search space based on model size
- Hardware system identifier for AI Configurator integration
**Permissions**: GPU discovery requires cluster-wide node read permissions. Cluster-scoped operators automatically have these permissions. Namespace-restricted operators can also use GPU discovery if granted node read permissions via RBAC.
If GPU discovery is unavailable (no permissions or no GPU labels), the profiler will use manually specified hardware configuration or defaults.
## Configuration
### DGDR Configuration Structure
All profiler configuration goes under `spec.profilingConfig.config`:
```yaml
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeploymentRequest
metadata:
name: my-deployment
spec:
model: "Qwen/Qwen3-0.6B"
backend: vllm
profilingConfig:
profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
configMapRef: # Optional: base DGD config
name: my-config
key: disagg.yaml
config:
sla: { ... }
hardware: { ... }
sweep: { ... }
planner: { ... }
deploymentOverrides:
workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
```
### SLA Configuration (Required)
```yaml
sla:
isl: 3000 # Average input sequence length (tokens)
osl: 150 # Average output sequence length (tokens)
ttft: 200.0 # Target Time To First Token (milliseconds)
itl: 20.0 # Target Inter-Token Latency (milliseconds)
```
- **ISL/OSL**: Based on your expected traffic patterns
- **TTFT**: First token latency target (lower = more GPUs needed, affects prefill engine)
- **ITL**: Token generation latency target (lower = more GPUs needed, affects decode engine)
- **Trade-offs**: Tighter SLAs require more GPU resources
### Hardware Configuration (Optional)
```yaml
hardware:
minNumGpusPerEngine: 2 # Auto-determined from model size and VRAM if not provided
maxNumGpusPerEngine: 8 # Maximum GPUs to test
numGpusPerNode: 8 # GPUs per node (for multi-node MoE)
gpuType: h200_sxm # GPU type hint (informational, auto-detected)
```
- **minNumGpusPerEngine**: Skip small TP sizes if your model is large
- **maxNumGpusPerEngine**: Limit search space or work around constraints (e.g., [AIC attention heads](#ai-configurator-attention-head-constraint-error))
- **numGpusPerNode**: Determine the upper bound of GPUs per node for dense models and configure Grove for multi-node MoE engines
- **gpuType**: Informational only, auto-detected by the controller. For AI Configurator, use `aicSystem` in the [sweep configuration](#ai-configurator-configuration) instead
> [!TIP]
> If you don't specify hardware constraints, the controller auto-detects based on your model size and available cluster resources.
### Sweep Configuration (Optional)
```yaml
sweep:
useAiConfigurator: false # Use real profiling (default)
prefillInterpolationGranularity: 16 # Samples for prefill TTFT curve
decodeInterpolationGranularity: 6 # Samples for decode ITL curve
```
- **useAiConfigurator**: Set to `true` for 20-30 second profiling (TensorRT-LLM only)
- **prefillInterpolationGranularity**: Samples for prefill TTFT curve (lower = faster but less accurate)
- **decodeInterpolationGranularity**: Samples for decode ITL curve. Since ITL interpolation is 3D and takes longer, we default to fewer samples. Increasing this value may quadratically increase profiling time.
### AI Configurator Configuration
Required if `useAiConfigurator: true`:
```yaml
sweep:
useAiConfigurator: true
aicSystem: h200_sxm # h100_sxm, h200_sxm, b200_sxm, gb200_sxm, a100_sxm
aicHfId: Qwen/Qwen3-32B # HuggingFace model ID
aicBackendVersion: "0.20.0" # TensorRT-LLM version
```
### Planner Configuration (Optional)
Pass arguments to the SLA planner:
```yaml
planner:
planner_min_endpoint: 2 # Minimum endpoints to maintain
planner_adjustment_interval: 60 # Adjustment interval (seconds)
planner_load_predictor: linear # Load prediction method
```
> [!NOTE]
> Planner arguments use the `planner_` prefix. See the AI Configurator documentation for the full list.
### Model Cache PVC (Advanced)
For large models, use a pre-populated PVC containing model weights instead of downloading from HuggingFace:
```yaml
deployment:
modelCache:
pvcName: "model-cache"
pvcPath: "hub/models--deepseek-ai--DeepSeek-R1"
mountPath: "/opt/model-cache"
```
Requirements:
- The PVC must exist in the same namespace as the DGDR
- The model weights must be accessible at `{mountPath}/{pvcPath}`
### Engine Configuration (Auto-configured)
The controller automatically injects these from high-level fields:
```yaml
# You specify:
spec:
model: "Qwen/Qwen3-0.6B"
backend: vllm
# Controller auto-injects:
profilingConfig:
config:
deployment:
model: "Qwen/Qwen3-0.6B"
engine:
backend: vllm
config: /path/to/configmap
```
You should **not** manually set `deployment.model` or `engine.backend` in `profilingConfig.config`.
### Using Existing DGD Configs (ConfigMap)
Reference an existing DGD config via ConfigMap:
```bash
kubectl create configmap my-config \
--from-file=disagg.yaml=/path/to/your/disagg.yaml \
--namespace $NAMESPACE \
--dry-run=client -o yaml | kubectl apply -f -
```
```yaml
profilingConfig:
configMapRef:
name: my-config
key: disagg.yaml
```
The profiler uses the DGD config as a **base template**, then optimizes it based on your SLA targets.
### CLI Arguments
| Argument | Type | Default | Description |
|----------|------|---------|-------------|
| `--backend` | string | - | Inference backend: vllm, sglang, trtllm |
| `--config` | string | - | Path to DGD YAML config file |
| `--model` | string | - | HuggingFace model ID |
| `--ttft` | float | - | Target TTFT in milliseconds |
| `--itl` | float | - | Target ITL in milliseconds |
| `--isl` | int | - | Average input sequence length |
| `--osl` | int | - | Average output sequence length |
| `--min-num-gpus` | int | auto | Minimum GPUs per engine |
| `--max-num-gpus` | int | 8 | Maximum GPUs per engine |
| `--use-ai-configurator` | flag | false | Use offline AI Configurator |
| `--pick-with-webui` | flag | false | Launch interactive WebUI |
| `--webui-port` | int | 8000 | Port for WebUI |
> [!NOTE]
> CLI arguments map to DGDR config fields: `--min-num-gpus` = `hardware.minNumGpusPerEngine`, `--max-num-gpus` = `hardware.maxNumGpusPerEngine`, `--use-ai-configurator` = `sweep.useAiConfigurator`. See [DGDR Configuration Structure](#dgdr-configuration-structure) for all field mappings.
## Integration
### With SLA Planner
The Profiler generates interpolation data that the SLA Planner uses for autoscaling decisions.
**Prefill Interpolation** (`selected_prefill_interpolation/raw_data.npz`):
- `prefill_isl`: 1D array of input sequence lengths tested
- `prefill_ttft`: 1D array of TTFTs (ms) at each ISL
- `prefill_thpt_per_gpu`: 1D array of throughput (tokens/s/GPU) at each ISL
**Decode Interpolation** (`selected_decode_interpolation/raw_data.npz`):
- `max_kv_tokens`: Total KV tokens capacity in decode engine
- `x_kv_usage`: 1D array of active KV usage fractions in the range [0, 1]
- `y_context_length`: 1D array of average context lengths tested
- `z_itl`: 1D array of ITLs (ms) at each (KV usage, context length) point
- `z_thpt_per_gpu`: 1D array of throughput (tokens/s/GPU) at each point
### With Dynamo Operator
When using DGDR, the Dynamo Operator:
1. Creates profiling jobs automatically
2. Stores profiling data in ConfigMaps (`planner-profile-data`)
3. Generates optimized DGD configurations
4. Deploys the DGD with SLA Planner integration
The generated DGD is tracked via labels:
```yaml
metadata:
labels:
dgdr.nvidia.com/name: my-deployment
dgdr.nvidia.com/namespace: your-namespace
```
### With Observability
Monitor profiling jobs:
```bash
kubectl logs -f job/profile-<dgdr-name> -n $NAMESPACE
kubectl describe dgdr <name> -n $NAMESPACE
```
## Advanced Topics
### Manual Deployment Control
Disable auto-deployment to review the generated DGD before applying:
```yaml
spec:
autoApply: false
```
Then manually extract and apply:
```bash
# Extract generated DGD from DGDR status
kubectl get dgdr my-deployment -n $NAMESPACE -o jsonpath='{.status.generatedDeployment}' | kubectl apply -f -
# Or save to file for review
kubectl get dgdr my-deployment -n $NAMESPACE -o jsonpath='{.status.generatedDeployment}' > my-dgd.yaml
```
### Mocker Deployment
Deploy a mocker deployment that simulates engines without GPUs:
```yaml
spec:
model: <model-name>
backend: trtllm
useMocker: true # Deploy mocker instead of real backend
autoApply: true
```
Profiling still runs against the real backend to collect performance data. The mocker uses this data to simulate realistic timing behavior. Useful for large-scale experiments, testing Planner behavior, and validating configurations.
### Accessing Profiling Artifacts
By default, profiling data is stored in ConfigMaps. For detailed artifacts (plots, logs, raw data), attach a PVC:
```yaml
profilingConfig:
outputPVC: "dynamo-pvc"
```
**ConfigMaps (always created):**
- `dgdr-output-<name>`: Generated DGD configuration
- `planner-profile-data`: Profiling data for Planner (JSON)
**PVC artifacts (optional):**
- Performance plots (PNGs)
- DGD configurations for each profiled deployment
- AIPerf profiling artifacts
- Raw profiling data (`.npz` files)
- Profiler logs
Access PVC results:
```bash
kubectl apply -f deploy/utils/manifests/pvc-access-pod.yaml -n $NAMESPACE
kubectl wait --for=condition=Ready pod/pvc-access-pod -n $NAMESPACE --timeout=60s
kubectl cp $NAMESPACE/pvc-access-pod:/data ./profiling-results
kubectl delete pod pvc-access-pod -n $NAMESPACE
```
### Output Performance Plots
The profiler generates plots to visualize performance data:
**Parallelization Mapping Sweep Plots:**
- `prefill_performance.png`: TTFT vs Parallelization Mapping size
- `decode_performance.png`: ITL vs Parallelization Mapping size and in-flight requests
**In-Depth Profiling Plots:**
- `selected_prefill_interpolation/prefill_ttft_interpolation.png`: TTFT vs ISL
- `selected_prefill_interpolation/prefill_throughput_interpolation.png`: Throughput vs ISL
- `selected_decode_interpolation/decode_itl_interplation.png`: ITL vs KV usage and context length
- `selected_decode_interpolation/decode_throughput_interpolation.png`: Throughput vs KV usage and context length
## Runtime Profiling (SGLang)
SGLang workers expose profiling endpoints for runtime performance analysis:
```bash
# Start profiling
curl -X POST http://localhost:9090/engine/start_profile \
-H "Content-Type: application/json" \
-d '{"output_dir": "/tmp/profiler_output"}'
# Run inference requests...
# Stop profiling
curl -X POST http://localhost:9090/engine/stop_profile
```
View traces using Chrome's `chrome://tracing`, [Perfetto UI](https://ui.perfetto.dev/), or TensorBoard.
## Troubleshooting
### Profiling Takes Too Long
**Solution 1**: Use AI Configurator for rapid profiling (TensorRT-LLM only):
```yaml
sweep:
useAiConfigurator: true
```
**Solution 2**: Reduce search space:
```yaml
hardware:
minNumGpusPerEngine: 4 # Skip TP1, TP2
maxNumGpusPerEngine: 8 # Don't test beyond TP8
```
### SLA Cannot Be Met
**Symptoms**: Profiler reports no configuration meets targets
**Solutions:**
1. Relax SLA targets (increase TTFT/ITL)
2. Add more GPU resources
3. Try a different backend
4. Use a smaller model
### AI Configurator: Attention Head Constraint Error
**Symptoms**: Profiling fails with error:
```text
AssertionError: num_heads <N> should be divisible by tp_size <M> and the division result should be >= 4
```
**Cause**: AI Configurator requires **≥4 attention heads per GPU**. Small models with few heads cannot use high TP sizes.
**Affected Models:**
- **Qwen3-0.6B** (16 heads): Max TP = 4
- **GPT-2** (12 heads): Max TP = 3
- Most models **<1B parameters**: May hit this constraint
**Solution**: Limit `maxNumGpusPerEngine`:
```yaml
hardware:
maxNumGpusPerEngine: 4 # For Qwen3-0.6B (16 heads / 4 = max TP of 4)
```
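**Calculate Max TP**: `max_tp = num_attention_heads / 4`, rounded down to the largest TP size that evenly divides `num_attention_heads` (e.g., 16 heads → max TP = 4; 12 heads → max TP = 3)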
> [!NOTE]
> This is an AI Configurator limitation. Online profiling doesn't have this constraint.
### Image Pull Errors
**Symptoms**: `ErrImagePull` or `ImagePullBackOff`
**Solution**: Ensure image pull secrets are configured:
```bash
kubectl create secret docker-registry nvcr-imagepullsecret \
--docker-server=nvcr.io \
--docker-username='$oauthtoken' \
--docker-password=<NGC_API_KEY> \
--namespace <your-namespace>
```
### Out of Memory During Profiling
**Symptoms**: OOM errors in profiling jobs
**Solutions:**
1. Reduce `gpu_memory_utilization` in engine config
2. Reduce `--max-context-length`
3. Skip larger TP configurations
4. Use fewer GPUs per test
### Unsupported Parallelization Mapping in Backend
**Symptoms**: Startup/runtime error in the backend (e.g., prime number of attention heads constraining TP to 1, or backend not supporting different TP sizes for prefill and decode).
**Solutions:**
1. Contact the backend maintainers to add support, then bump the backend version in Dynamo
2. Constrain the max and min number of GPUs per engine to the supported range
## See Also
- [DGDR Examples](../../../components/src/dynamo/profiler/deploy/) - Complete DGDR YAML examples
- [DGDR API Reference](/docs/kubernetes/api_reference.md) - DGDR specification
- [Profiler Arguments Reference](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/profiler/utils/profiler_argparse.py) - Full CLI reference
<!--
SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
> **⚠️ Important**: This documentation is automatically generated from source code.
> Do not edit this file directly.
# API Reference
## Packages
- [nvidia.com/v1alpha1](#nvidiacomv1alpha1)
## nvidia.com/v1alpha1
Package v1alpha1 contains API Schema definitions for the nvidia.com v1alpha1 API group.
This package defines the DynamoGraphDeploymentRequest (DGDR) custom resource, which provides
a high-level, SLA-driven interface for deploying machine learning models on Dynamo.
Package v1alpha1 contains API Schema definitions for the nvidia.com v1alpha1 API group.
### Resource Types
- [DynamoCheckpoint](#dynamocheckpoint)
- [DynamoComponentDeployment](#dynamocomponentdeployment)
- [DynamoGraphDeployment](#dynamographdeployment)
- [DynamoGraphDeploymentRequest](#dynamographdeploymentrequest)
- [DynamoGraphDeploymentScalingAdapter](#dynamographdeploymentscalingadapter)
- [DynamoModel](#dynamomodel)
#### Autoscaling
Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter
with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md
for migration guidance. This field will be removed in a future API version.
_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `enabled` _boolean_ | Deprecated: This field is ignored. | | |
| `minReplicas` _integer_ | Deprecated: This field is ignored. | | |
| `maxReplicas` _integer_ | Deprecated: This field is ignored. | | |
| `behavior` _[HorizontalPodAutoscalerBehavior](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#horizontalpodautoscalerbehavior-v2-autoscaling)_ | Deprecated: This field is ignored. | | |
| `metrics` _[MetricSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#metricspec-v2-autoscaling) array_ | Deprecated: This field is ignored. | | |
#### CheckpointMode
_Underlying type:_ _string_
CheckpointMode defines how checkpoint creation is handled
_Validation:_
- Enum: [Auto Manual]
_Appears in:_
- [ServiceCheckpointConfig](#servicecheckpointconfig)
| Field | Description |
| --- | --- |
| `Auto` | CheckpointModeAuto means the DGD controller will automatically create a Checkpoint CR<br /> |
| `Manual` | CheckpointModeManual means the user must create the Checkpoint CR themselves<br /> |
#### ComponentKind
_Underlying type:_ _string_
ComponentKind represents the type of underlying Kubernetes resource.
_Validation:_
- Enum: [PodClique PodCliqueScalingGroup Deployment LeaderWorkerSet]
_Appears in:_
- [ServiceReplicaStatus](#servicereplicastatus)
| Field | Description |
| --- | --- |
| `PodClique` | ComponentKindPodClique represents a PodClique resource.<br /> |
| `PodCliqueScalingGroup` | ComponentKindPodCliqueScalingGroup represents a PodCliqueScalingGroup resource.<br /> |
| `Deployment` | ComponentKindDeployment represents a Deployment resource.<br /> |
| `LeaderWorkerSet` | ComponentKindLeaderWorkerSet represents a LeaderWorkerSet resource.<br /> |
#### ConfigMapKeySelector
ConfigMapKeySelector selects a specific key from a ConfigMap.
Used to reference external configuration data stored in ConfigMaps.
_Appears in:_
- [ProfilingConfigSpec](#profilingconfigspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `name` _string_ | Name of the ConfigMap containing the desired data. | | Required: \{\} <br /> |
| `key` _string_ | Key in the ConfigMap to select. If not specified, defaults to "disagg.yaml". | disagg.yaml | |
#### DeploymentOverridesSpec
DeploymentOverridesSpec allows users to customize metadata for auto-created DynamoGraphDeployments.
When autoApply is enabled, these overrides are applied to the generated DGD resource.
_Appears in:_
- [DynamoGraphDeploymentRequestSpec](#dynamographdeploymentrequestspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `name` _string_ | Name is the desired name for the created DynamoGraphDeployment.<br />If not specified, defaults to the DGDR name. | | Optional: \{\} <br /> |
| `namespace` _string_ | Namespace is the desired namespace for the created DynamoGraphDeployment.<br />If not specified, defaults to the DGDR namespace. | | Optional: \{\} <br /> |
| `labels` _object (keys:string, values:string)_ | Labels are additional labels to add to the DynamoGraphDeployment metadata.<br />These are merged with auto-generated labels from the profiling process. | | Optional: \{\} <br /> |
| `annotations` _object (keys:string, values:string)_ | Annotations are additional annotations to add to the DynamoGraphDeployment metadata. | | Optional: \{\} <br /> |
| `workersImage` _string_ | WorkersImage specifies the container image to use for DynamoGraphDeployment worker components.<br />This image is used for both temporary DGDs created during online profiling and the final DGD.<br />If omitted, the image from the base config file (e.g., disagg.yaml) is used.<br />Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" | | Optional: \{\} <br /> |
#### DeploymentStatus
DeploymentStatus tracks the state of an auto-created DynamoGraphDeployment.
This status is populated when autoApply is enabled and a DGD is created.
_Appears in:_
- [DynamoGraphDeploymentRequestStatus](#dynamographdeploymentrequeststatus)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `name` _string_ | Name is the name of the created DynamoGraphDeployment. | | |
| `namespace` _string_ | Namespace is the namespace of the created DynamoGraphDeployment. | | |
| `state` _string_ | State is the current state of the DynamoGraphDeployment.<br />This value is mirrored from the DGD's status.state field. | | |
| `created` _boolean_ | Created indicates whether the DGD has been successfully created.<br />Used to prevent recreation if the DGD is manually deleted by users. | | |
#### DynamoCheckpoint
DynamoCheckpoint is the Schema for the dynamocheckpoints API
It represents a container checkpoint that can be used to restore pods to a warm state
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | |
| `kind` _string_ | `DynamoCheckpoint` | | |
| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | |
| `spec` _[DynamoCheckpointSpec](#dynamocheckpointspec)_ | | | |
| `status` _[DynamoCheckpointStatus](#dynamocheckpointstatus)_ | | | |
#### DynamoCheckpointIdentity
DynamoCheckpointIdentity defines the inputs that determine checkpoint equivalence
Two checkpoints with the same identity hash are considered equivalent
_Appears in:_
- [DynamoCheckpointSpec](#dynamocheckpointspec)
- [ServiceCheckpointConfig](#servicecheckpointconfig)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `model` _string_ | Model is the model identifier (e.g., "meta-llama/Llama-3-70B") | | Required: \{\} <br /> |
| `backendFramework` _string_ | BackendFramework is the runtime framework (vllm, sglang, trtllm) | | Enum: [vllm sglang trtllm] <br />Required: \{\} <br /> |
| `dynamoVersion` _string_ | DynamoVersion is the Dynamo platform version (optional)<br />If not specified, version is not included in identity hash<br />This ensures checkpoint compatibility across Dynamo releases | | Optional: \{\} <br /> |
| `tensorParallelSize` _integer_ | TensorParallelSize is the tensor parallel configuration | 1 | Minimum: 1 <br />Optional: \{\} <br /> |
| `pipelineParallelSize` _integer_ | PipelineParallelSize is the pipeline parallel configuration | 1 | Minimum: 1 <br />Optional: \{\} <br /> |
| `dtype` _string_ | Dtype is the data type (fp16, bf16, fp8, etc.) | | Optional: \{\} <br /> |
| `maxModelLen` _integer_ | MaxModelLen is the maximum sequence length | | Minimum: 1 <br />Optional: \{\} <br /> |
| `extraParameters` _object (keys:string, values:string)_ | ExtraParameters are additional parameters that affect the checkpoint hash<br />Use for any framework-specific or custom parameters not covered above | | Optional: \{\} <br /> |
#### DynamoCheckpointJobConfig
DynamoCheckpointJobConfig defines the configuration for the checkpoint creation Job
_Appears in:_
- [DynamoCheckpointSpec](#dynamocheckpointspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `podTemplateSpec` _[PodTemplateSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#podtemplatespec-v1-core)_ | PodTemplateSpec allows customizing the checkpoint Job pod<br />This should include the container that runs the workload to be checkpointed | | Required: \{\} <br /> |
| `activeDeadlineSeconds` _integer_ | ActiveDeadlineSeconds specifies the maximum time the Job can run | 3600 | Optional: \{\} <br /> |
| `backoffLimit` _integer_ | BackoffLimit specifies the number of retries before marking the Job failed | 3 | Optional: \{\} <br /> |
| `ttlSecondsAfterFinished` _integer_ | TTLSecondsAfterFinished specifies how long to keep the Job after completion | 300 | Optional: \{\} <br /> |
#### DynamoCheckpointPhase
_Underlying type:_ _string_
DynamoCheckpointPhase represents the current phase of the checkpoint lifecycle
_Validation:_
- Enum: [Pending Creating Ready Failed]
_Appears in:_
- [DynamoCheckpointStatus](#dynamocheckpointstatus)
| Field | Description |
| --- | --- |
| `Pending` | DynamoCheckpointPhasePending indicates the checkpoint CR has been created but the Job has not started<br /> |
| `Creating` | DynamoCheckpointPhaseCreating indicates the checkpoint Job is running<br /> |
| `Ready` | DynamoCheckpointPhaseReady indicates the checkpoint tar file is available on the PVC<br /> |
| `Failed` | DynamoCheckpointPhaseFailed indicates the checkpoint creation failed<br /> |
#### DynamoCheckpointSpec
DynamoCheckpointSpec defines the desired state of DynamoCheckpoint
_Appears in:_
- [DynamoCheckpoint](#dynamocheckpoint)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `identity` _[DynamoCheckpointIdentity](#dynamocheckpointidentity)_ | Identity defines the inputs that determine checkpoint equivalence | | Required: \{\} <br /> |
| `job` _[DynamoCheckpointJobConfig](#dynamocheckpointjobconfig)_ | Job defines the configuration for the checkpoint creation Job | | Required: \{\} <br /> |
#### DynamoCheckpointStatus
DynamoCheckpointStatus defines the observed state of DynamoCheckpoint
_Appears in:_
- [DynamoCheckpoint](#dynamocheckpoint)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `phase` _[DynamoCheckpointPhase](#dynamocheckpointphase)_ | Phase represents the current phase of the checkpoint lifecycle | | Enum: [Pending Creating Ready Failed] <br />Optional: \{\} <br /> |
| `identityHash` _string_ | IdentityHash is the computed hash of the checkpoint identity<br />This hash is used to identify equivalent checkpoints | | Optional: \{\} <br /> |
| `location` _string_ | Location is the full URI/path to the checkpoint in the storage backend<br />For PVC: same as TarPath (e.g., /checkpoints/\{hash\}.tar)<br />For S3: s3://bucket/prefix/\{hash\}.tar<br />For OCI: oci://registry/repo:\{hash\} | | Optional: \{\} <br /> |
| `storageType` _[DynamoCheckpointStorageType](#dynamocheckpointstoragetype)_ | StorageType indicates the storage backend type used for this checkpoint | | Enum: [pvc s3 oci] <br />Optional: \{\} <br /> |
| `jobName` _string_ | JobName is the name of the checkpoint creation Job | | Optional: \{\} <br /> |
| `createdAt` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#time-v1-meta)_ | CreatedAt is the timestamp when the checkpoint tar was created | | Optional: \{\} <br /> |
| `message` _string_ | Message provides additional information about the current state | | Optional: \{\} <br /> |
| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions represent the latest available observations of the checkpoint's state | | Optional: \{\} <br /> |
#### DynamoCheckpointStorageType
_Underlying type:_ _string_
DynamoCheckpointStorageType defines the supported storage backends for checkpoints
_Validation:_
- Enum: [pvc s3 oci]
_Appears in:_
- [DynamoCheckpointStatus](#dynamocheckpointstatus)
#### DynamoComponentDeployment
DynamoComponentDeployment is the Schema for the dynamocomponentdeployments API
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | |
| `kind` _string_ | `DynamoComponentDeployment` | | |
| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | |
| `spec` _[DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)_ | Spec defines the desired state for this Dynamo component deployment. | | |
#### DynamoComponentDeploymentSharedSpec
_Appears in:_
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
- [DynamoGraphDeploymentSpec](#dynamographdeploymentspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `annotations` _object (keys:string, values:string)_ | Annotations to add to generated Kubernetes resources for this component<br />(such as Pod, Service, and Ingress when applicable). | | |
| `labels` _object (keys:string, values:string)_ | Labels to add to generated Kubernetes resources for this component. | | |
| `serviceName` _string_ | The name of the component | | |
| `componentType` _string_ | ComponentType indicates the role of this component (for example, "main"). | | |
| `subComponentType` _string_ | SubComponentType indicates the sub-role of this component (for example, "prefill"). | | |
| `dynamoNamespace` _string_ | DynamoNamespace is deprecated and will be removed in a future version.<br />The DGD Kubernetes namespace and DynamoGraphDeployment name are used to construct the Dynamo namespace for each component | | Optional: \{\} <br /> |
| `globalDynamoNamespace` _boolean_ | GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace | | |
| `resources` _[Resources](#resources)_ | Resources requested and limits for this component, including CPU, memory,<br />GPUs/devices, and any runtime-specific resources. | | |
| `autoscaling` _[Autoscaling](#autoscaling)_ | Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter<br />with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md<br />for migration guidance. This field will be removed in a future API version. | | |
| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs defines additional environment variables to inject into the component containers. | | |
| `envFromSecret` _string_ | EnvFromSecret references a Secret whose key/value pairs will be exposed as<br />environment variables in the component containers. | | |
| `volumeMounts` _[VolumeMount](#volumemount) array_ | VolumeMounts references PVCs defined at the top level for volumes to be mounted by the component. | | |
| `ingress` _[IngressSpec](#ingressspec)_ | Ingress config to expose the component outside the cluster (or through a service mesh). | | |
| `modelRef` _[ModelReference](#modelreference)_ | ModelRef references a model that this component serves<br />When specified, a headless service will be created for endpoint discovery | | Optional: \{\} <br /> |
| `sharedMemory` _[SharedMemorySpec](#sharedmemoryspec)_ | SharedMemory controls the tmpfs mounted at /dev/shm (enable/disable and size). | | |
| `extraPodMetadata` _[ExtraPodMetadata](#extrapodmetadata)_ | ExtraPodMetadata adds labels/annotations to the created Pods. | | Optional: \{\} <br /> |
| `extraPodSpec` _[ExtraPodSpec](#extrapodspec)_ | ExtraPodSpec allows overriding the main pod spec configuration.<br />It is a k8s standard PodSpec. It also contains a MainContainer (standard k8s Container) field<br />that allows overriding the main container configuration. | | Optional: \{\} <br /> |
| `livenessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | LivenessProbe to detect and restart unhealthy containers. | | |
| `readinessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | ReadinessProbe to signal when the container is ready to receive traffic. | | |
| `replicas` _integer_ | Replicas is the desired number of Pods for this component.<br />When scalingAdapter is enabled, this field is managed by the<br />DynamoGraphDeploymentScalingAdapter and should not be modified directly. | | Minimum: 0 <br /> |
| `multinode` _[MultinodeSpec](#multinodespec)_ | Multinode is the configuration for multinode components. | | |
| `scalingAdapter` _[ScalingAdapter](#scalingadapter)_ | ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.<br />When enabled, replicas are managed via DGDSA and external autoscalers can scale<br />the service using the Scale subresource. When disabled, replicas can be modified directly. | | Optional: \{\} <br /> |
| `eppConfig` _[EPPConfig](#eppconfig)_ | EPPConfig defines EPP-specific configuration options for Endpoint Picker Plugin components.<br />Only applicable when ComponentType is "epp". | | Optional: \{\} <br /> |
| `checkpoint` _[ServiceCheckpointConfig](#servicecheckpointconfig)_ | Checkpoint configures container checkpointing for this service.<br />When enabled, pods can be restored from a checkpoint file for faster cold start. | | Optional: \{\} <br /> |
#### DynamoComponentDeploymentSpec
DynamoComponentDeploymentSpec defines the desired state of DynamoComponentDeployment
_Appears in:_
- [DynamoComponentDeployment](#dynamocomponentdeployment)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `backendFramework` _string_ | BackendFramework specifies the backend framework (e.g., "sglang", "vllm", "trtllm") | | Enum: [sglang vllm trtllm] <br /> |
| `annotations` _object (keys:string, values:string)_ | Annotations to add to generated Kubernetes resources for this component<br />(such as Pod, Service, and Ingress when applicable). | | |
| `labels` _object (keys:string, values:string)_ | Labels to add to generated Kubernetes resources for this component. | | |
| `serviceName` _string_ | The name of the component | | |
| `componentType` _string_ | ComponentType indicates the role of this component (for example, "main"). | | |
| `subComponentType` _string_ | SubComponentType indicates the sub-role of this component (for example, "prefill"). | | |
| `dynamoNamespace` _string_ | DynamoNamespace is deprecated and will be removed in a future version.<br />The DGD Kubernetes namespace and DynamoGraphDeployment name are used to construct the Dynamo namespace for each component | | Optional: \{\} <br /> |
| `globalDynamoNamespace` _boolean_ | GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace | | |
| `resources` _[Resources](#resources)_ | Resources requested and limits for this component, including CPU, memory,<br />GPUs/devices, and any runtime-specific resources. | | |
| `autoscaling` _[Autoscaling](#autoscaling)_ | Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter<br />with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md<br />for migration guidance. This field will be removed in a future API version. | | |
| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs defines additional environment variables to inject into the component containers. | | |
| `envFromSecret` _string_ | EnvFromSecret references a Secret whose key/value pairs will be exposed as<br />environment variables in the component containers. | | |
| `volumeMounts` _[VolumeMount](#volumemount) array_ | VolumeMounts references PVCs defined at the top level for volumes to be mounted by the component. | | |
| `ingress` _[IngressSpec](#ingressspec)_ | Ingress config to expose the component outside the cluster (or through a service mesh). | | |
| `modelRef` _[ModelReference](#modelreference)_ | ModelRef references a model that this component serves<br />When specified, a headless service will be created for endpoint discovery | | Optional: \{\} <br /> |
| `sharedMemory` _[SharedMemorySpec](#sharedmemoryspec)_ | SharedMemory controls the tmpfs mounted at /dev/shm (enable/disable and size). | | |
| `extraPodMetadata` _[ExtraPodMetadata](#extrapodmetadata)_ | ExtraPodMetadata adds labels/annotations to the created Pods. | | Optional: \{\} <br /> |
| `extraPodSpec` _[ExtraPodSpec](#extrapodspec)_ | ExtraPodSpec allows overriding the main pod spec configuration.<br />It is a k8s standard PodSpec. It also contains a MainContainer (standard k8s Container) field<br />that allows overriding the main container configuration. | | Optional: \{\} <br /> |
| `livenessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | LivenessProbe to detect and restart unhealthy containers. | | |
| `readinessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | ReadinessProbe to signal when the container is ready to receive traffic. | | |
| `replicas` _integer_ | Replicas is the desired number of Pods for this component.<br />When scalingAdapter is enabled, this field is managed by the<br />DynamoGraphDeploymentScalingAdapter and should not be modified directly. | | Minimum: 0 <br /> |
| `multinode` _[MultinodeSpec](#multinodespec)_ | Multinode is the configuration for multinode components. | | |
| `scalingAdapter` _[ScalingAdapter](#scalingadapter)_ | ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.<br />When enabled, replicas are managed via DGDSA and external autoscalers can scale<br />the service using the Scale subresource. When disabled, replicas can be modified directly. | | Optional: \{\} <br /> |
| `eppConfig` _[EPPConfig](#eppconfig)_ | EPPConfig defines EPP-specific configuration options for Endpoint Picker Plugin components.<br />Only applicable when ComponentType is "epp". | | Optional: \{\} <br /> |
| `checkpoint` _[ServiceCheckpointConfig](#servicecheckpointconfig)_ | Checkpoint configures container checkpointing for this service.<br />When enabled, pods can be restored from a checkpoint file for faster cold start. | | Optional: \{\} <br /> |
#### DynamoGraphDeployment
DynamoGraphDeployment is the Schema for the dynamographdeployments API.
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | |
| `kind` _string_ | `DynamoGraphDeployment` | | |
| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | |
| `spec` _[DynamoGraphDeploymentSpec](#dynamographdeploymentspec)_ | Spec defines the desired state for this graph deployment. | | |
| `status` _[DynamoGraphDeploymentStatus](#dynamographdeploymentstatus)_ | Status reflects the current observed state of this graph deployment. | | |
#### DynamoGraphDeploymentRequest
DynamoGraphDeploymentRequest is the Schema for the dynamographdeploymentrequests API.
It serves as the primary interface for users to request model deployments with
specific performance and resource constraints, enabling SLA-driven deployments.
Lifecycle:
1. Initial → Pending: Validates spec and prepares for profiling
2. Pending → Profiling: Creates and runs profiling job (online or AIC)
3. Profiling → Ready/Deploying: Generates DGD spec after profiling completes
4. Deploying → Ready: When autoApply=true, monitors DGD until Ready
5. Ready: Terminal state when DGD is operational or spec is available
6. DeploymentDeleted: Terminal state when auto-created DGD is manually deleted
The spec becomes immutable once profiling starts. Users must delete and recreate
the DGDR to modify configuration after this point.
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | |
| `kind` _string_ | `DynamoGraphDeploymentRequest` | | |
| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | |
| `spec` _[DynamoGraphDeploymentRequestSpec](#dynamographdeploymentrequestspec)_ | Spec defines the desired state for this deployment request. | | |
| `status` _[DynamoGraphDeploymentRequestStatus](#dynamographdeploymentrequeststatus)_ | Status reflects the current observed state of this deployment request. | | |
#### DynamoGraphDeploymentRequestSpec
DynamoGraphDeploymentRequestSpec defines the desired state of a DynamoGraphDeploymentRequest.
This CRD serves as the primary interface for users to request model deployments with
specific performance constraints and resource requirements, enabling SLA-driven deployments.
_Appears in:_
- [DynamoGraphDeploymentRequest](#dynamographdeploymentrequest)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `model` _string_ | Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").<br />This is a high-level identifier for easy reference in kubectl output and logs.<br />The controller automatically sets this value in profilingConfig.config.deployment.model. | | Required: \{\} <br /> |
| `backend` _string_ | Backend specifies the inference backend for profiling.<br />The controller automatically sets this value in profilingConfig.config.engine.backend.<br />Profiling runs on real GPUs or via AIC simulation to collect performance data. | | Enum: [vllm sglang trtllm] <br />Required: \{\} <br /> |
| `useMocker` _boolean_ | UseMocker indicates whether to deploy a mocker DynamoGraphDeployment instead of<br />a real backend deployment. When true, the deployment uses simulated engines that<br />don't require GPUs, using the profiling data to simulate realistic timing behavior.<br />Mocker is available in all backend images and useful for large-scale experiments.<br />Profiling still runs against the real backend (specified above) to collect performance data. | false | |
| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.<br />Note: GPU discovery is automatically attempted to detect GPU resources from Kubernetes<br />cluster nodes. If the operator has node read permissions (cluster-wide or explicitly granted),<br />discovered GPU configuration is used as defaults when hardware configuration is not manually<br />specified (minNumGpusPerEngine, maxNumGpusPerEngine, numGpusPerNode). User-specified values<br />always take precedence over auto-discovered values. If GPU discovery fails (e.g.,<br />namespace-restricted operator without node permissions), manual hardware config is required.<br />This configuration is passed directly to the profiler.<br />The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).<br />Note: deployment.model and engine.backend are automatically set from the high-level<br />model and backend fields and should not be specified in this config. | | Required: \{\} <br /> |
| `autoApply` _boolean_ | AutoApply indicates whether to automatically create a DynamoGraphDeployment<br />after profiling completes. If false, only the spec is generated and stored in status.<br />Users can then manually create a DGD using the generated spec. | false | |
| `deploymentOverrides` _[DeploymentOverridesSpec](#deploymentoverridesspec)_ | DeploymentOverrides allows customizing metadata for the auto-created DGD.<br />Only applicable when AutoApply is true. | | Optional: \{\} <br /> |
#### DynamoGraphDeploymentRequestStatus
DynamoGraphDeploymentRequestStatus represents the observed state of a DynamoGraphDeploymentRequest.
The controller updates this status as the DGDR progresses through its lifecycle.
_Appears in:_
- [DynamoGraphDeploymentRequest](#dynamographdeploymentrequest)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `state` _string_ | State is a high-level textual status of the deployment request lifecycle.<br />Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed"<br />Empty string ("") represents the initial state before initialization. | | |
| `backend` _string_ | Backend is extracted from profilingConfig.config.engine.backend for display purposes.<br />This field is populated by the controller and shown in kubectl output. | | Optional: \{\} <br /> |
| `observedGeneration` _integer_ | ObservedGeneration reflects the generation of the most recently observed spec.<br />Used to detect spec changes and enforce immutability after profiling starts. | | |
| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the deployment request.<br />Standard condition types include: Validation, Profiling, SpecGenerated, DeploymentReady.<br />Conditions are merged by type on patch updates. | | |
| `profilingResults` _string_ | ProfilingResults contains a reference to the ConfigMap holding profiling data.<br />Format: "configmap/<name>" | | Optional: \{\} <br /> |
| `generatedDeployment` _[RawExtension](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#rawextension-runtime-pkg)_ | GeneratedDeployment contains the full generated DynamoGraphDeployment specification<br />including metadata, based on profiling results. Users can extract this to create<br />a DGD manually, or it's used automatically when autoApply is true.<br />Stored as RawExtension to preserve all fields including metadata.<br />For mocker backends, this contains the mocker DGD spec. | | EmbeddedResource: \{\} <br />Optional: \{\} <br /> |
| `deployment` _[DeploymentStatus](#deploymentstatus)_ | Deployment tracks the auto-created DGD when AutoApply is true.<br />Contains name, namespace, state, and creation status of the managed DGD. | | Optional: \{\} <br /> |
#### DynamoGraphDeploymentScalingAdapter
DynamoGraphDeploymentScalingAdapter provides a scaling interface for individual services
within a DynamoGraphDeployment. It implements the Kubernetes scale
subresource, enabling integration with HPA, KEDA, and custom autoscalers.
The adapter acts as an intermediary between autoscalers and the DGD,
ensuring that only the adapter controller modifies the DGD's service replicas.
This prevents conflicts when multiple autoscaling mechanisms are in play.
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | |
| `kind` _string_ | `DynamoGraphDeploymentScalingAdapter` | | |
| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | |
| `spec` _[DynamoGraphDeploymentScalingAdapterSpec](#dynamographdeploymentscalingadapterspec)_ | | | |
| `status` _[DynamoGraphDeploymentScalingAdapterStatus](#dynamographdeploymentscalingadapterstatus)_ | | | |
#### DynamoGraphDeploymentScalingAdapterSpec
DynamoGraphDeploymentScalingAdapterSpec defines the desired state of DynamoGraphDeploymentScalingAdapter
_Appears in:_
- [DynamoGraphDeploymentScalingAdapter](#dynamographdeploymentscalingadapter)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `replicas` _integer_ | Replicas is the desired number of replicas for the target service.<br />This field is modified by external autoscalers (HPA/KEDA/Planner) or manually by users. | | Minimum: 0 <br />Required: \{\} <br /> |
| `dgdRef` _[DynamoGraphDeploymentServiceRef](#dynamographdeploymentserviceref)_ | DGDRef references the DynamoGraphDeployment and the specific service to scale. | | Required: \{\} <br /> |
#### DynamoGraphDeploymentScalingAdapterStatus
DynamoGraphDeploymentScalingAdapterStatus defines the observed state of DynamoGraphDeploymentScalingAdapter
_Appears in:_
- [DynamoGraphDeploymentScalingAdapter](#dynamographdeploymentscalingadapter)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `replicas` _integer_ | Replicas is the current number of replicas for the target service.<br />This is synced from the DGD's service replicas and is required for the scale subresource. | | Optional: \{\} <br /> |
| `selector` _string_ | Selector is a label selector string for the pods managed by this adapter.<br />Required for HPA compatibility via the scale subresource. | | Optional: \{\} <br /> |
| `lastScaleTime` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#time-v1-meta)_ | LastScaleTime is the last time the adapter scaled the target service. | | Optional: \{\} <br /> |
#### DynamoGraphDeploymentServiceRef
DynamoGraphDeploymentServiceRef identifies a specific service within a DynamoGraphDeployment
_Appears in:_
- [DynamoGraphDeploymentScalingAdapterSpec](#dynamographdeploymentscalingadapterspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `name` _string_ | Name of the DynamoGraphDeployment | | MinLength: 1 <br />Required: \{\} <br /> |
| `serviceName` _string_ | ServiceName is the key name of the service within the DGD's spec.services map to scale | | MinLength: 1 <br />Required: \{\} <br /> |
#### DynamoGraphDeploymentSpec
DynamoGraphDeploymentSpec defines the desired state of DynamoGraphDeployment.
_Appears in:_
- [DynamoGraphDeployment](#dynamographdeployment)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `pvcs` _[PVC](#pvc) array_ | PVCs defines a list of persistent volume claims that can be referenced by components.<br />Each PVC must have a unique name that can be referenced in component specifications. | | MaxItems: 100 <br />Optional: \{\} <br /> |
| `services` _object (keys:string, values:[DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec))_ | Services are the services to deploy as part of this deployment. | | MaxProperties: 25 <br />Optional: \{\} <br /> |
| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs are environment variables applied to all services in the deployment unless<br />overridden by service-specific configuration. | | Optional: \{\} <br /> |
| `backendFramework` _string_ | BackendFramework specifies the backend framework (e.g., "sglang", "vllm", "trtllm"). | | Enum: [sglang vllm trtllm] <br /> |
| `restart` _[Restart](#restart)_ | Restart specifies the restart policy for the graph deployment. | | Optional: \{\} <br /> |
#### DynamoGraphDeploymentStatus
DynamoGraphDeploymentStatus defines the observed state of DynamoGraphDeployment.
_Appears in:_
- [DynamoGraphDeployment](#dynamographdeployment)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `state` _string_ | State is a high-level textual status of the graph deployment lifecycle. | | |
| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the graph deployment.<br />The slice is merged by type on patch updates. | | |
| `services` _object (keys:string, values:[ServiceReplicaStatus](#servicereplicastatus))_ | Services contains per-service replica status information.<br />The map key is the service name from spec.services. | | Optional: \{\} <br /> |
| `restart` _[RestartStatus](#restartstatus)_ | Restart contains the status of the restart of the graph deployment. | | Optional: \{\} <br /> |
| `checkpoints` _object (keys:string, values:[ServiceCheckpointStatus](#servicecheckpointstatus))_ | Checkpoints contains per-service checkpoint status information.<br />The map key is the service name from spec.services. | | Optional: \{\} <br /> |
#### DynamoModel
DynamoModel is the Schema for the dynamo models API
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | |
| `kind` _string_ | `DynamoModel` | | |
| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | |
| `spec` _[DynamoModelSpec](#dynamomodelspec)_ | | | |
| `status` _[DynamoModelStatus](#dynamomodelstatus)_ | | | |
#### DynamoModelSpec
DynamoModelSpec defines the desired state of DynamoModel
_Appears in:_
- [DynamoModel](#dynamomodel)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `modelName` _string_ | ModelName is the full model identifier (e.g., "meta-llama/Llama-3.3-70B-Instruct-lora") | | Required: \{\} <br /> |
| `baseModelName` _string_ | BaseModelName is the base model identifier that matches the service label<br />This is used to discover endpoints via headless services | | Required: \{\} <br /> |
| `modelType` _string_ | ModelType specifies the type of model (e.g., "base", "lora", "adapter") | base | Enum: [base lora adapter] <br />Optional: \{\} <br /> |
| `source` _[ModelSource](#modelsource)_ | Source specifies the model source location (only applicable for lora model type) | | Optional: \{\} <br /> |
#### DynamoModelStatus
DynamoModelStatus defines the observed state of DynamoModel
_Appears in:_
- [DynamoModel](#dynamomodel)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `endpoints` _[EndpointInfo](#endpointinfo) array_ | Endpoints is the current list of all endpoints for this model | | Optional: \{\} <br /> |
| `readyEndpoints` _integer_ | ReadyEndpoints is the count of endpoints that are ready | | |
| `totalEndpoints` _integer_ | TotalEndpoints is the total count of endpoints | | |
| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions represents the latest available observations of the model's state | | Optional: \{\} <br /> |
#### EPPConfig
EPPConfig contains configuration for EPP (Endpoint Picker Plugin) components.
EPP is responsible for intelligent endpoint selection and KV-aware routing.
_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `configMapRef` _[ConfigMapKeySelector](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#configmapkeyselector-v1-core)_ | ConfigMapRef references a user-provided ConfigMap containing EPP configuration.<br />The ConfigMap should contain EndpointPickerConfig YAML.<br />Mutually exclusive with Config. | | Optional: \{\} <br /> |
| `config` _[EndpointPickerConfig](#endpointpickerconfig)_ | Config allows specifying EPP EndpointPickerConfig directly as a structured object.<br />The operator will marshal this to YAML and create a ConfigMap automatically.<br />Mutually exclusive with ConfigMapRef.<br />One of ConfigMapRef or Config must be specified (no default configuration).<br />Uses the upstream type from github.com/kubernetes-sigs/gateway-api-inference-extension | | Type: object <br />Optional: \{\} <br /> |
#### EndpointInfo
EndpointInfo represents a single endpoint (pod) serving the model
_Appears in:_
- [DynamoModelStatus](#dynamomodelstatus)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `address` _string_ | Address is the full address of the endpoint (e.g., "http://10.0.1.5:9090") | | |
| `podName` _string_ | PodName is the name of the pod serving this endpoint | | Optional: \{\} <br /> |
| `ready` _boolean_ | Ready indicates whether the endpoint is ready to serve traffic<br />For LoRA models: true if the POST /loras request succeeded with a 2xx status code<br />For base models: always false (no probing performed) | | |
#### ExtraPodMetadata
_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `annotations` _object (keys:string, values:string)_ | | | |
| `labels` _object (keys:string, values:string)_ | | | |
#### ExtraPodSpec
_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `mainContainer` _[Container](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#container-v1-core)_ | | | |
#### IngressSpec
_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `enabled` _boolean_ | Enabled exposes the component through an ingress or virtual service when true. | | |
| `host` _string_ | Host is the base host name to route external traffic to this component. | | |
| `useVirtualService` _boolean_ | UseVirtualService indicates whether to configure a service-mesh VirtualService instead of a standard Ingress. | | |
| `virtualServiceGateway` _string_ | VirtualServiceGateway optionally specifies the gateway name to attach the VirtualService to. | | |
| `hostPrefix` _string_ | HostPrefix is an optional prefix added before the host. | | |
| `annotations` _object (keys:string, values:string)_ | Annotations to set on the generated Ingress/VirtualService resources. | | |
| `labels` _object (keys:string, values:string)_ | Labels to set on the generated Ingress/VirtualService resources. | | |
| `tls` _[IngressTLSSpec](#ingresstlsspec)_ | TLS holds the TLS configuration used by the Ingress/VirtualService. | | |
| `hostSuffix` _string_ | HostSuffix is an optional suffix appended after the host. | | |
| `ingressControllerClassName` _string_ | IngressControllerClassName selects the ingress controller class (e.g., "nginx"). | | |
#### IngressTLSSpec
_Appears in:_
- [IngressSpec](#ingressspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `secretName` _string_ | SecretName is the name of a Kubernetes Secret containing the TLS certificate and key. | | |
#### ModelReference
ModelReference identifies a model served by this component
_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `name` _string_ | Name is the base model identifier (e.g., "llama-3-70b-instruct-v1") | | Required: \{\} <br /> |
| `revision` _string_ | Revision is the model revision/version (optional) | | Optional: \{\} <br /> |
#### ModelSource
ModelSource defines the source location of a model
_Appears in:_
- [DynamoModelSpec](#dynamomodelspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `uri` _string_ | URI is the model source URI<br />Supported formats:<br />- S3: s3://bucket/path/to/model<br />- HuggingFace: hf://org/model@revision_sha | | Required: \{\} <br /> |
#### MultinodeSpec
_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `nodeCount` _integer_ | Indicates the number of nodes to deploy for multinode components.<br />Total number of GPUs is nodeCount * GPU limit.<br />Must be greater than 1. | 2 | Minimum: 2 <br /> |
#### PVC
_Appears in:_
- [DynamoGraphDeploymentSpec](#dynamographdeploymentspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `create` _boolean_ | Create indicates to create a new PVC | | |
| `name` _string_ | Name is the name of the PVC | | Required: \{\} <br /> |
| `storageClass` _string_ | StorageClass to be used for PVC creation. Required when create is true. | | |
| `size` _[Quantity](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#quantity-resource-api)_ | Size of the volume in Gi, used during PVC creation. Required when create is true. | | |
| `volumeAccessMode` _[PersistentVolumeAccessMode](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#persistentvolumeaccessmode-v1-core)_ | VolumeAccessMode is the volume access mode of the PVC. Required when create is true. | | |
#### ProfilingConfigSpec
ProfilingConfigSpec defines configuration for the profiling process.
This structure maps directly to the profile_sla.py config format.
See benchmarks/profiler/utils/profiler_argparse.py for the complete schema.
_Appears in:_
- [DynamoGraphDeploymentRequestSpec](#dynamographdeploymentrequestspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `config` _[JSON](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#json-v1-apiextensions-k8s-io)_ | Config is the profiling configuration as arbitrary JSON/YAML. This will be passed directly to the profiler.<br />The profiler will validate the configuration and report any errors. | | Optional: \{\} <br />Type: object <br /> |
| `configMapRef` _[ConfigMapKeySelector](#configmapkeyselector)_ | ConfigMapRef is an optional reference to a ConfigMap containing the DynamoGraphDeployment<br />base config file (disagg.yaml). This is separate from the profiling config above.<br />The path to this config will be set as engine.config in the profiling config. | | Optional: \{\} <br /> |
| `profilerImage` _string_ | ProfilerImage specifies the container image to use for profiling jobs.<br />This image contains the profiler code and dependencies needed for SLA-based profiling.<br />Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" | | Required: \{\} <br /> |
| `outputPVC` _string_ | OutputPVC is an optional PersistentVolumeClaim name for storing profiling output.<br />If specified, all profiling artifacts (logs, plots, configs, raw data) will be written<br />to this PVC instead of an ephemeral emptyDir volume. This allows users to access<br />complete profiling results after the job completes by mounting the PVC.<br />The PVC must exist in the same namespace as the DGDR.<br />If not specified, profiling uses emptyDir and only essential data is saved to ConfigMaps.<br />Note: ConfigMaps are still created regardless of this setting for planner integration. | | Optional: \{\} <br /> |
| `resources` _[ResourceRequirements](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#resourcerequirements-v1-core)_ | Resources specifies the compute resource requirements for the profiling job container.<br />If not specified, no resource requests or limits are set. | | Optional: \{\} <br /> |
| `tolerations` _[Toleration](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#toleration-v1-core) array_ | Tolerations allows the profiling job to be scheduled on nodes with matching taints.<br />For example, to schedule on GPU nodes, add a toleration for the nvidia.com/gpu taint. | | Optional: \{\} <br /> |
#### ResourceItem
_Appears in:_
- [Resources](#resources)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `cpu` _string_ | CPU specifies the CPU resource request/limit (e.g., "1000m", "2") | | |
| `memory` _string_ | Memory specifies the memory resource request/limit (e.g., "4Gi", "8Gi") | | |
| `gpu` _string_ | GPU indicates the number of GPUs to request.<br />For multinode deployments, the total number of GPUs is the multinode nodeCount * GPU. | | |
| `gpuType` _string_ | GPUType can specify a custom GPU type, e.g. "gpu.intel.com/xe"<br />By default if not specified, the GPU type is "nvidia.com/gpu" | | |
| `custom` _object (keys:string, values:string)_ | Custom specifies additional custom resource requests/limits | | |
#### Resources
Resources defines requests and limits for a component, including CPU, memory,
GPUs/devices, and any runtime-specific resources.
_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `requests` _[ResourceItem](#resourceitem)_ | Requests specifies the minimum resources required by the component | | |
| `limits` _[ResourceItem](#resourceitem)_ | Limits specifies the maximum resources allowed for the component | | |
| `claims` _[ResourceClaim](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#resourceclaim-v1-core) array_ | Claims specifies resource claims for dynamic resource allocation | | |
#### Restart
_Appears in:_
- [DynamoGraphDeploymentSpec](#dynamographdeploymentspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `id` _string_ | ID is an arbitrary string that triggers a restart when changed.<br />Any modification to this value will initiate a restart of the graph deployment according to the strategy. | | MinLength: 1 <br />Required: \{\} <br /> |
| `strategy` _[RestartStrategy](#restartstrategy)_ | Strategy specifies the restart strategy for the graph deployment. | | Optional: \{\} <br /> |
#### RestartPhase
_Underlying type:_ _string_
_Appears in:_
- [RestartStatus](#restartstatus)
| Field | Description |
| --- | --- |
| `Pending` | |
| `Restarting` | |
| `Completed` | |
| `Failed` | |
#### RestartStatus
RestartStatus contains the status of the restart of the graph deployment.
_Appears in:_
- [DynamoGraphDeploymentStatus](#dynamographdeploymentstatus)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `observedID` _string_ | ObservedID is the restart ID that has been observed and is being processed.<br />Matches the Restart.ID field in the spec. | | |
| `phase` _[RestartPhase](#restartphase)_ | Phase is the phase of the restart. | | |
| `inProgress` _string array_ | InProgress contains the names of the services that are currently being restarted. | | Optional: \{\} <br /> |
#### RestartStrategy
_Appears in:_
- [Restart](#restart)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `type` _[RestartStrategyType](#restartstrategytype)_ | Type specifies the restart strategy type. | Sequential | Enum: [Sequential Parallel] <br /> |
| `order` _string array_ | Order specifies the order in which the services should be restarted. | | Optional: \{\} <br /> |
#### RestartStrategyType
_Underlying type:_ _string_
_Appears in:_
- [RestartStrategy](#restartstrategy)
| Field | Description |
| --- | --- |
| `Sequential` | |
| `Parallel` | |
#### ScalingAdapter
ScalingAdapter configures whether a service uses the DynamoGraphDeploymentScalingAdapter
for replica management. When enabled, the DGDSA owns the replicas field and
external autoscalers (HPA, KEDA, Planner) can control scaling via the Scale subresource.
_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `enabled` _boolean_ | Enabled indicates whether the ScalingAdapter should be enabled for this service.<br />When true, a DGDSA is created and owns the replicas field.<br />When false (default), no DGDSA is created and replicas can be modified directly in the DGD. | false | Optional: \{\} <br /> |
#### ServiceCheckpointConfig
ServiceCheckpointConfig configures checkpointing for a DGD service
_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `enabled` _boolean_ | Enabled indicates whether checkpointing is enabled for this service | false | Optional: \{\} <br /> |
| `mode` _[CheckpointMode](#checkpointmode)_ | Mode defines how checkpoint creation is handled<br />- Auto: DGD controller creates Checkpoint CR automatically<br />- Manual: User must create Checkpoint CR | Auto | Enum: [Auto Manual] <br />Optional: \{\} <br /> |
| `checkpointRef` _string_ | CheckpointRef references an existing Checkpoint CR to use<br />If specified, Identity is ignored and this checkpoint is used directly | | Optional: \{\} <br /> |
| `identity` _[DynamoCheckpointIdentity](#dynamocheckpointidentity)_ | Identity defines the checkpoint identity for hash computation<br />Used when Mode is Auto or when looking up existing checkpoints<br />Required when checkpointRef is not specified | | Optional: \{\} <br /> |
#### ServiceCheckpointStatus
ServiceCheckpointStatus contains checkpoint information for a single service.
_Appears in:_
- [DynamoGraphDeploymentStatus](#dynamographdeploymentstatus)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `checkpointName` _string_ | CheckpointName is the name of the associated Checkpoint CR | | Optional: \{\} <br /> |
| `identityHash` _string_ | IdentityHash is the computed hash of the checkpoint identity | | Optional: \{\} <br /> |
| `ready` _boolean_ | Ready indicates if the checkpoint is ready for use | | Optional: \{\} <br /> |
#### ServiceReplicaStatus
ServiceReplicaStatus contains replica information for a single service.
_Appears in:_
- [DynamoGraphDeploymentStatus](#dynamographdeploymentstatus)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `componentKind` _[ComponentKind](#componentkind)_ | ComponentKind is the underlying resource kind (e.g., "PodClique", "PodCliqueScalingGroup", "Deployment", "LeaderWorkerSet"). | | Enum: [PodClique PodCliqueScalingGroup Deployment LeaderWorkerSet] <br /> |
| `componentName` _string_ | ComponentName is the name of the underlying resource. | | |
| `replicas` _integer_ | Replicas is the total number of non-terminated replicas.<br />Required for all component kinds. | | Minimum: 0 <br /> |
| `updatedReplicas` _integer_ | UpdatedReplicas is the number of replicas at the current/desired revision.<br />Required for all component kinds. | | Minimum: 0 <br /> |
| `readyReplicas` _integer_ | ReadyReplicas is the number of ready replicas.<br />Populated for PodClique, Deployment, and LeaderWorkerSet.<br />Not available for PodCliqueScalingGroup.<br />When nil, the field is omitted from the API response. | | Minimum: 0 <br />Optional: \{\} <br /> |
| `availableReplicas` _integer_ | AvailableReplicas is the number of available replicas.<br />For Deployment: replicas ready for >= minReadySeconds.<br />For PodCliqueScalingGroup: replicas where all constituent PodCliques have >= MinAvailable ready pods.<br />Not available for PodClique or LeaderWorkerSet.<br />When nil, the field is omitted from the API response. | | Minimum: 0 <br />Optional: \{\} <br /> |
#### SharedMemorySpec
_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `disabled` _boolean_ | | | |
| `size` _[Quantity](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#quantity-resource-api)_ | | | |
#### VolumeMount
VolumeMount references a PVC defined at the top level for volumes to be mounted by the component
_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `name` _string_ | Name references a PVC name defined in the top-level PVCs map | | Required: \{\} <br /> |
| `mountPoint` _string_ | MountPoint specifies where to mount the volume.<br />If useAsCompilationCache is true and mountPoint is not specified,<br />a backend-specific default will be used. | | |
| `useAsCompilationCache` _boolean_ | UseAsCompilationCache indicates this volume should be used as a compilation cache.<br />When true, backend-specific environment variables will be set and default mount points may be used. | false | |
# Operator Default Values Injection
The Dynamo operator automatically applies default values to various fields when they are not explicitly specified in your deployments. These defaults include:
- **Health Probes**: Startup, liveness, and readiness probes are configured differently for frontend, worker, and planner components. For example, worker components receive a startup probe with a 2-hour timeout (720 failures × 10 seconds) to accommodate long model loading times.
- **Security Context**: All components receive `fsGroup: 1000` by default to ensure proper file permissions for mounted volumes. This can be overridden via the `extraPodSpec.securityContext` field.
- **Shared Memory**: All components receive an 8Gi shared memory volume mounted at `/dev/shm` by default (can be disabled or resized via the `sharedMemory` field).
- **Environment Variables**: Components automatically receive environment variables like `DYN_NAMESPACE`, `DYN_PARENT_DGD_K8S_NAME`, `DYNAMO_PORT`, and backend-specific variables.
- **Pod Configuration**: Default `terminationGracePeriodSeconds` of 60 seconds and `restartPolicy: Always`.
- **Autoscaling**: When enabled without explicit metrics, defaults to CPU-based autoscaling with 80% target utilization.
- **Backend-Specific Behavior**: For multinode deployments, probes are automatically modified or removed for worker nodes depending on the backend framework (vLLM, SGLang, or TensorRT-LLM).
## Pod Specification Defaults
All components receive the following pod-level defaults unless overridden:
- **`terminationGracePeriodSeconds`**: `60` seconds
- **`restartPolicy`**: `Always`
## Security Context
The operator automatically applies default security context settings to all components to ensure proper file permissions, particularly for mounted volumes:
- **`fsGroup`**: `1000` - Sets the group ownership of mounted volumes and any files created in those volumes
This default ensures that non-root containers can write to mounted volumes (like model caches or persistent storage) without permission issues. The `fsGroup` setting is particularly important for:
- Model downloads and caching
- Compilation cache directories
- Persistent volume claims (PVCs)
- SSH key generation in multinode deployments
### Overriding Security Context
To override the default security context, specify your own `securityContext` in the `extraPodSpec` of your component:
```yaml
services:
YourWorker:
extraPodSpec:
securityContext:
fsGroup: 2000 # Custom group ID
runAsUser: 1000
runAsGroup: 1000
runAsNonRoot: true
```
**Important**: When you provide *any* `securityContext` object in `extraPodSpec`, the operator will not inject any defaults. This gives you complete control over the security context, including the ability to run as root (by omitting `runAsNonRoot` or setting it to `false`).
### OpenShift and Security Context Constraints
In OpenShift environments with Security Context Constraints (SCCs), you may need to omit explicit UID/GID values to allow OpenShift's admission controllers to assign them dynamically:
```yaml
services:
YourWorker:
extraPodSpec:
securityContext:
# Omit fsGroup to let OpenShift assign it based on SCC
# OpenShift will inject the appropriate UID range
```
Alternatively, if you want to keep the default `fsGroup: 1000` behavior and are certain your cluster allows it, you don't need to specify anything - the operator defaults will work.
## Shared Memory Configuration
Shared memory is enabled by default for all components:
- **Enabled**: `true` (unless explicitly disabled via `sharedMemory.disabled`)
- **Size**: `8Gi`
- **Mount Path**: `/dev/shm`
- **Volume Type**: `emptyDir` with `memory` medium
To disable shared memory or customize the size, use the `sharedMemory` field in your component specification.
## Health Probes by Component Type
The operator applies different default health probes based on the component type.
### Frontend Components
Frontend components receive the following probe configurations:
**Liveness Probe:**
- **Type**: HTTP GET
- **Path**: `/health`
- **Port**: `http` (8000)
- **Initial Delay**: 60 seconds
- **Period**: 60 seconds
- **Timeout**: 30 seconds
- **Failure Threshold**: 10
**Readiness Probe:**
- **Type**: Exec command
- **Command**: `curl -s http://localhost:${DYNAMO_PORT}/health | jq -e ".status == \"healthy\""`
- **Initial Delay**: 60 seconds
- **Period**: 60 seconds
- **Timeout**: 30 seconds
- **Failure Threshold**: 10
### Worker Components
Worker components receive the following probe configurations:
**Liveness Probe:**
- **Type**: HTTP GET
- **Path**: `/live`
- **Port**: `system` (9090)
- **Period**: 5 seconds
- **Timeout**: 30 seconds
- **Failure Threshold**: 1
**Readiness Probe:**
- **Type**: HTTP GET
- **Path**: `/health`
- **Port**: `system` (9090)
- **Period**: 10 seconds
- **Timeout**: 30 seconds
- **Failure Threshold**: 60
**Startup Probe:**
- **Type**: HTTP GET
- **Path**: `/live`
- **Port**: `system` (9090)
- **Period**: 10 seconds
- **Timeout**: 5 seconds
- **Failure Threshold**: 720 (allows up to 2 hours for startup: 10s × 720 = 7200s)
:::{note}
For larger models (typically >70B parameters) or slower storage systems, you may need to increase the `failureThreshold` to allow more time for model loading. Calculate the required threshold based on your expected startup time: `failureThreshold = (expected_startup_seconds / period)`. Override the startup probe in your component specification if the default 2-hour window is insufficient.
:::
### Multinode Deployment Probe Modifications
For multinode deployments, the operator modifies probes based on the backend framework and node role:
#### VLLM Backend
The operator automatically selects between two deployment modes based on parallelism configuration:
**Tensor/Pipeline Parallel Mode** (when `world_size > GPUs_per_node`):
- Uses Ray for distributed execution (`--distributed-executor-backend ray`)
- **Leader nodes**: Starts Ray head and runs vLLM; all probes remain active
- **Worker nodes**: Run Ray agents only; all probes (liveness, readiness, startup) are removed
**Data Parallel Mode** (when `world_size × data_parallel_size > GPUs_per_node`):
- **Worker nodes**: All probes (liveness, readiness, startup) are removed
- **Leader nodes**: All probes remain active
#### SGLang Backend
- **Worker nodes**: All probes (liveness, readiness, startup) are removed
#### TensorRT-LLM Backend
- **Leader nodes**: All probes remain unchanged
- **Worker nodes**:
- Liveness and startup probes are removed
- Readiness probe is replaced with a TCP socket check on SSH port (2222):
- **Initial Delay**: 20 seconds
- **Period**: 20 seconds
- **Timeout**: 5 seconds
- **Failure Threshold**: 10
## Environment Variables
The operator automatically injects environment variables based on component type and configuration:
### All Components
- **`DYN_NAMESPACE`**: The Dynamo namespace for the component
- **`DYN_PARENT_DGD_K8S_NAME`**: The parent DynamoGraphDeployment Kubernetes resource name
- **`DYN_PARENT_DGD_K8S_NAMESPACE`**: The parent DynamoGraphDeployment Kubernetes namespace
### Frontend Components
- **`DYNAMO_PORT`**: `8000`
- **`DYN_HTTP_PORT`**: `8000`
### Worker Components
- **`DYN_SYSTEM_PORT`**: `9090` (automatically enables the system metrics server)
- **`DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS`**: `["generate"]`
- **`DYN_SYSTEM_ENABLED`**: `true` (needed for runtime images 0.6.1 and older)
### Planner Components
- **`PLANNER_PROMETHEUS_PORT`**: `9085`
### VLLM Backend (with compilation cache)
When a volume mount is configured with `useAsCompilationCache: true`:
- **`VLLM_CACHE_ROOT`**: Set to the mount point of the cache volume
## Service Account
Planner components automatically receive the following service account:
- **`serviceAccountName`**: `planner-serviceaccount`
## Image Pull Secrets
The operator automatically discovers and injects image pull secrets for container images. When a component specifies a container image, the operator:
1. Scans all Kubernetes secrets of type `kubernetes.io/dockerconfigjson` in the component's namespace
2. Extracts the docker registry server URLs from each secret's authentication configuration
3. Matches the container image's registry host against the discovered registry URLs
4. Automatically injects matching secrets as `imagePullSecrets` in the pod specification
This eliminates the need to manually specify image pull secrets for each component. The operator maintains an internal index of docker secrets and their associated registries, refreshing this index periodically.
**To disable automatic image pull secret discovery** for a specific component, add the following annotation:
```yaml
annotations:
nvidia.com/disable-image-pull-secret-discovery: "true"
```
## Autoscaling Defaults
When autoscaling is enabled but no metrics are specified, the operator applies:
- **Default Metric**: CPU utilization
- **Target Average Utilization**: `80%`
## Port Configurations
Default container ports are configured based on component type:
### Frontend Components
- **Port**: 8000
- **Protocol**: TCP
- **Name**: `http`
### Worker Components
- **Port**: 9090
- **Protocol**: TCP
- **Name**: `system`
### Planner Components
- **Port**: 9085
- **Protocol**: TCP
- **Name**: `metrics`
## Backend-Specific Configurations
### VLLM
- **Ray Head Port**: 6379 (for Ray cluster coordination in multinode TP/PP deployments)
- **Data Parallel RPC Port**: 13445 (for data parallel multinode deployments)
### SGLang
- **Distribution Init Port**: 29500 (for multinode deployments)
### TensorRT-LLM
- **SSH Port**: 2222 (for multinode MPI communication)
- **OpenMPI Environment**: `OMPI_MCA_orte_keep_fqdn_hostnames=1`
## Implementation Reference
For users who want to understand the implementation details or contribute to the operator, the default values described in this document are set in the following source files:
- **Health Probes, Security Context & Pod Specifications**: [`internal/dynamo/graph.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/graph.go) - Contains the main logic for applying default probes, security context, environment variables, shared memory, and pod configurations
- **Component-Specific Defaults**:
- [`internal/dynamo/component_frontend.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/component_frontend.go)
- [`internal/dynamo/component_worker.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/component_worker.go)
- [`internal/dynamo/component_planner.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/component_planner.go)
- **Image Pull Secrets**: [`internal/secrets/docker.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/secrets/docker.go) - Implements the docker secret indexer and automatic discovery
- **Backend-Specific Behavior**:
- [`internal/dynamo/backend_vllm.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/backend_vllm.go)
- [`internal/dynamo/backend_sglang.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/backend_sglang.go)
- [`internal/dynamo/backend_trtllm.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/backend_trtllm.go)
- **Constants & Annotations**: [`internal/consts/consts.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/consts/consts.go) - Defines annotation keys and other constants
## Notes
- All these defaults can be overridden by explicitly specifying values in your DynamoComponentDeployment or DynamoGraphDeployment resources
- User-specified probes (via `livenessProbe`, `readinessProbe`, or `startupProbe` fields) take precedence over operator defaults
- For security context, if you provide *any* `securityContext` in `extraPodSpec`, no defaults will be injected, giving you full control
- For multinode deployments, some defaults are modified or removed as described above to accommodate distributed execution patterns
- The `extraPodSpec.mainContainer` field can be used to override probe configurations set by the operator
......@@ -227,14 +227,15 @@ See [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#
### Automatic GPU Discovery
Cluster-scoped operators can optionally enable automatic GPU discovery:
The operator automatically discovers GPU resources from your Kubernetes cluster nodes when available. GPU discovery provides:
```yaml
spec:
enableGpuDiscovery: true
```
- Hardware information (GPU model, VRAM, GPUs per node)
- Automatic calculation of profiling search space based on model size
- Hardware system identifier for AI Configurator integration
**Permissions**: GPU discovery requires cluster-wide node read permissions. Cluster-scoped operators automatically have these permissions. Namespace-restricted operators can also use GPU discovery if granted node read permissions via RBAC.
This is only available with cluster-scoped operators (`namespaceRestriction.enabled=false`) as it requires cluster-wide node access permissions.
If GPU discovery is unavailable (no permissions or no GPU labels), the profiler will use manually specified hardware configuration or defaults.
## Configuration
......
......@@ -462,8 +462,8 @@ _Appears in:_
| `model` _string_ | Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").<br />This is a high-level identifier for easy reference in kubectl output and logs.<br />The controller automatically sets this value in profilingConfig.config.deployment.model. | | Required: \{\} <br /> |
| `backend` _string_ | Backend specifies the inference backend for profiling.<br />The controller automatically sets this value in profilingConfig.config.engine.backend.<br />Profiling runs on real GPUs or via AIC simulation to collect performance data. | | Enum: [vllm sglang trtllm] <br />Required: \{\} <br /> |
| `useMocker` _boolean_ | UseMocker indicates whether to deploy a mocker DynamoGraphDeployment instead of<br />a real backend deployment. When true, the deployment uses simulated engines that<br />don't require GPUs, using the profiling data to simulate realistic timing behavior.<br />Mocker is available in all backend images and useful for large-scale experiments.<br />Profiling still runs against the real backend (specified above) to collect performance data. | false | |
| `enableGpuDiscovery` _boolean_ | EnableGpuDiscovery controls whether the profiler should automatically discover GPU<br />resources from the Kubernetes cluster nodes. When enabled, the profiler will override<br />any manually specified hardware configuration (minNumGpusPerEngine, maxNumGpusPerEngine,<br />numGpusPerNode) with values detected from the cluster.<br />Requires cluster-wide node access permissions - only available with cluster-scoped operators. | false | Optional: \{\} <br /> |
| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.<br />This configuration is passed directly to the profiler.<br />The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).<br />Note: deployment.model and engine.backend are automatically set from the high-level<br />modelName and backend fields and should not be specified in this config. | | Required: \{\} <br /> |
| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.<br />Note: GPU discovery is automatically attempted to detect GPU resources from Kubernetes<br />cluster nodes. If the operator has node read permissions (cluster-wide or explicitly granted),<br />discovered GPU configuration is used as defaults when hardware configuration is not manually<br />specified (minNumGpusPerEngine, maxNumGpusPerEngine, numGpusPerNode). User-specified values<br />always take precedence over auto-discovered values. If GPU discovery fails (e.g.,<br />namespace-restricted operator without node permissions), manual hardware config is required.<br />This configuration is passed directly to the profiler.<br />The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).<br />Note: deployment.model and engine.backend are automatically set from the high-level<br />model and backend fields and should not be specified in this config. | | Required: \{\} <br /> |
| `enableGpuDiscovery` _boolean_ | EnableGPUDiscovery controls whether the operator attempts to discover GPU hardware from cluster nodes.<br />DEPRECATED: This field is deprecated and will be removed in v1beta1. GPU discovery is now always<br />attempted automatically. Setting this field has no effect - the operator will always try to discover<br />GPU hardware when node read permissions are available. If discovery is unavailable (e.g., namespace-scoped<br />operator without permissions), manual hardware configuration is required regardless of this setting. | true | Optional: \{\} <br /> |
| `autoApply` _boolean_ | AutoApply indicates whether to automatically create a DynamoGraphDeployment<br />after profiling completes. If false, only the spec is generated and stored in status.<br />Users can then manually create a DGD using the generated spec. | false | |
| `deploymentOverrides` _[DeploymentOverridesSpec](#deploymentoverridesspec)_ | DeploymentOverrides allows customizing metadata for the auto-created DGD.<br />Only applicable when AutoApply is true. | | Optional: \{\} <br /> |
......
......@@ -18,6 +18,7 @@ project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
from dynamo.profiler.profile_sla import run_profile # noqa: E402
from dynamo.profiler.utils.defaults import SearchStrategy # noqa: E402
from dynamo.profiler.utils.model_info import ModelInfo # noqa: E402
pytestmark = [
......@@ -44,7 +45,7 @@ class TestProfileSlaAiconfigurator:
def llm_args(self, request):
class Args:
def __init__(self):
self.model = ""
self.model = "Qwen/Qwen3-32B" # Set to match aic_hf_id for consistency
self.dgd_image = ""
self.backend = "trtllm"
self.config = "examples/backends/trtllm/deploy/disagg.yaml"
......@@ -63,14 +64,13 @@ class TestProfileSlaAiconfigurator:
self.decode_interpolation_granularity = 6
self.service_name = ""
self.dry_run = False
self.use_ai_configurator = True
self.aic_system = "h200_sxm"
self.aic_hf_id = "Qwen/Qwen3-32B"
self.aic_backend = ""
self.aic_backend_version = None
self.num_gpus_per_node = 8
self.deploy_after_profile = False
self.pick_with_webui = False
# Use RAPID strategy to leverage AI Configurator for perf estimation
# This avoids Kubernetes deployments while testing aiconfigurator functionality
self.search_strategy = SearchStrategy.RAPID
self.system = "h200_sxm" # Must match aic_system for RAPID strategy
# Provide minimal model_info to avoid HF queries
self.model_info = ModelInfo(
model_size=16384.0,
......@@ -86,10 +86,10 @@ class TestProfileSlaAiconfigurator:
@pytest.mark.performance
@pytest.mark.parallel
@pytest.mark.asyncio
@pytest.mark.parametrize("missing_arg", ["aic_system", "aic_hf_id"])
@pytest.mark.parametrize("missing_arg", ["system", "model"])
async def test_aiconfigurator_missing_args(self, llm_args, missing_arg):
# Check that validation error happens when a required arg is missing.
# Note: aic_backend_version is optional - when None, auto-detects latest version
# Check that validation error happens when a required arg is missing for RAPID strategy.
# These args are required when using SearchStrategy.RAPID with AI Configurator.
setattr(llm_args, missing_arg, None)
with pytest.raises(ValueError):
await run_profile(llm_args)
......@@ -103,8 +103,7 @@ class TestProfileSlaAiconfigurator:
"arg_name, bad_value",
[
# these values don't exist in the aiconfigurator database.
("aic_system", "fake_gpu_system"),
("aic_backend_version", "0.1.0"),
("system", "fake_gpu_system"),
],
)
async def test_aiconfigurator_no_data(self, llm_args, arg_name, bad_value):
......@@ -131,14 +130,11 @@ class TestProfileSlaAiconfigurator:
@pytest.mark.nightly
# fmt: off
@pytest.mark.parametrize(
"backend, aic_backend_version",
"backend",
[
pytest.param("trtllm", None, marks=pytest.mark.trtllm),
pytest.param("trtllm", "1.2.0rc5", marks=pytest.mark.trtllm),
pytest.param("vllm", None, marks=pytest.mark.vllm),
pytest.param("vllm", "0.12.0", marks=pytest.mark.vllm),
pytest.param("sglang", None, marks=pytest.mark.sglang),
pytest.param("sglang", "0.5.6.post2", marks=pytest.mark.sglang),
pytest.param("trtllm", marks=pytest.mark.trtllm),
pytest.param("vllm", marks=pytest.mark.vllm),
pytest.param("sglang", marks=pytest.mark.sglang),
],
)
# fmt: on
......@@ -149,11 +145,10 @@ class TestProfileSlaAiconfigurator:
"meta-llama/Llama-3.1-405B",
],
)
async def test_aiconfigurator_dense_models(
self, llm_args, hf_model_id, backend, aic_backend_version
):
# Test that profile_sla works with a variety of backend versions and model names.
llm_args.aic_hf_id = hf_model_id
async def test_aiconfigurator_dense_models(self, llm_args, hf_model_id, backend):
# Test that profile_sla works with a variety of backends and model names
# using AI Configurator's RAPID strategy for performance estimation.
# Backend version is not used with RAPID strategy - performance comes from AI Configurator.
llm_args.model = hf_model_id # Used by RAPID strategy
llm_args.backend = backend
llm_args.aic_backend_version = aic_backend_version
await run_profile(llm_args)
......@@ -19,6 +19,7 @@ project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
from dynamo.profiler.profile_sla import run_profile # noqa: E402
from dynamo.profiler.utils.defaults import SearchStrategy # noqa: E402
from dynamo.profiler.utils.model_info import ModelInfo # noqa: E402
from dynamo.profiler.utils.search_space_autogen import ( # noqa: E402
auto_generate_search_space,
......@@ -66,12 +67,13 @@ class TestProfileSLADryRun:
self.decode_interpolation_granularity = 6
self.service_name = ""
self.dry_run = True
self.use_ai_configurator = False
self.aic_system = None
self.aic_hf_id = None
self.aic_backend = ""
self.aic_backend_version = None
self.num_gpus_per_node = 8
self.search_strategy = SearchStrategy.THOROUGH
self.system = ""
self.deploy_after_profile = False
self.pick_with_webui = False
self.model_cache_pvc_name = ""
......@@ -113,12 +115,13 @@ class TestProfileSLADryRun:
self.decode_interpolation_granularity = 6
self.service_name = ""
self.dry_run = True
self.use_ai_configurator = False
self.aic_system = None
self.aic_hf_id = None
self.aic_backend = ""
self.aic_backend_version = None
self.num_gpus_per_node = 8
self.search_strategy = SearchStrategy.THOROUGH
self.system = ""
self.deploy_after_profile = False
self.pick_with_webui = False
self.model_cache_pvc_name = ""
......@@ -181,12 +184,13 @@ class TestProfileSLADryRun:
self.decode_interpolation_granularity = 6
self.service_name = ""
self.dry_run = True
self.use_ai_configurator = False
self.aic_system = None
self.aic_hf_id = None
self.aic_backend = ""
self.aic_backend_version = None
self.num_gpus_per_node = 8
self.search_strategy = SearchStrategy.THOROUGH
self.system = ""
self.deploy_after_profile = False
self.pick_with_webui = False
self.model_cache_pvc_name = ""
......@@ -238,12 +242,13 @@ class TestProfileSLADryRun:
self.decode_interpolation_granularity = 6
self.service_name = ""
self.dry_run = True
self.use_ai_configurator = False
self.aic_system = None
self.aic_hf_id = None
self.aic_backend = ""
self.aic_backend_version = None
self.num_gpus_per_node = 8
self.search_strategy = SearchStrategy.THOROUGH
self.system = ""
self.deploy_after_profile = False
self.pick_with_webui = False
# Added in newer profiler versions; keep Args compatible with search_space_autogen
......@@ -318,16 +323,14 @@ class TestProfileSLADryRun:
self.decode_interpolation_granularity = 6
self.service_name = ""
self.dry_run = True
self.use_ai_configurator = False
self.aic_system = None
self.aic_hf_id = None
self.aic_backend = ""
self.aic_backend_version = None
# Set to 0 to trigger auto-generation path
self.num_gpus_per_node = 0
self.system = "h100_sxm" # Renamed from aic_system, moved to hardware
self.search_strategy = SearchStrategy.RAPID # New top-level arg
# GPU discovery values (auto-populated by Operator)
self.num_gpus_per_node = 8
self.gpu_model = "H100-SXM5-80GB"
self.gpu_vram_mib = 81920
self.deploy_after_profile = False
self.pick_with_webui = False
self.enable_gpu_discovery = True
self.model_cache_pvc_name = ""
self.model_cache_pvc_path = ""
self.model_cache_pvc_mount_path = "/opt/model-cache"
......@@ -340,27 +343,24 @@ class TestProfileSLADryRun:
@pytest.mark.integration
@pytest.mark.gpu_0
@pytest.mark.vllm
@patch("dynamo.profiler.utils.search_space_autogen.get_gpu_summary")
@patch("dynamo.profiler.utils.search_space_autogen.get_model_info")
@patch("dynamo.profiler.utils.model_info.get_model_info")
async def test_profile_with_autogen_search_space_h100(
self,
mock_get_model_info,
mock_get_gpu_summary,
vllm_args_with_model_autogen,
mock_h100_gpu_info,
mock_model_info,
):
"""Test profile_sla with auto-generated search space on mocked H100 cluster.
This test demonstrates how search space is auto-generated based on model
size and available GPU memory.
size and available GPU memory. GPU info is provided via command-line
arguments injected by the Operator into the profiling config (DYN-2135).
"""
# Configure the mocks to return the appropriate info
# Configure the mock to return the appropriate model info
mock_get_model_info.return_value = mock_model_info
mock_get_gpu_summary.return_value = mock_h100_gpu_info
# Run the profile - the search space will be auto-generated
# based on the model and mocked GPU info
# based on the model and GPU info from args
auto_generate_search_space(vllm_args_with_model_autogen)
await run_profile(vllm_args_with_model_autogen)
......@@ -390,15 +390,14 @@ class TestProfileSLADryRun:
self.decode_interpolation_granularity = 6
self.service_name = ""
self.dry_run = True
self.use_ai_configurator = False
self.aic_system = None
self.aic_hf_id = None
self.aic_backend = ""
self.aic_backend_version = None
self.num_gpus_per_node = 0
self.system = "h100_sxm" # Renamed from aic_system, moved to hardware
self.search_strategy = SearchStrategy.RAPID # New top-level arg
# GPU discovery values (auto-populated by Operator)
self.num_gpus_per_node = 8
self.gpu_model = "H100-SXM5-80GB"
self.gpu_vram_mib = 81920
self.deploy_after_profile = False
self.pick_with_webui = False
self.enable_gpu_discovery = True
self.model_cache_pvc_name = ""
self.model_cache_pvc_path = ""
self.model_cache_pvc_mount_path = "/opt/model-cache"
......@@ -411,27 +410,33 @@ class TestProfileSLADryRun:
@pytest.mark.gpu_0
@pytest.mark.integration
@pytest.mark.sglang
@patch("dynamo.profiler.utils.search_space_autogen.get_gpu_summary")
@patch("dynamo.profiler.utils.search_space_autogen.get_model_info")
@pytest.mark.skip(
reason="Blocked on AI Configurator database format: sglang 0.5.6.post2 database "
"is in legacy format missing 'gemm_dtype' field. "
"See: KeyError in aiconfigurator/sdk/perf_database.py"
)
@patch("dynamo.profiler.utils.model_info.get_model_info")
async def test_sglang_profile_with_autogen_search_space_h100(
self,
mock_get_model_info,
mock_get_gpu_summary,
sglang_args_with_model_autogen,
mock_h100_gpu_info,
mock_model_info,
):
"""Test profile_sla with auto-generated search space for sglang on mocked H100 cluster.
This test demonstrates how search space is auto-generated based on model
size and available GPU memory for sglang backend.
size and available GPU memory for sglang backend. GPU info is provided via
command-line arguments injected by the Operator into the profiling config (DYN-2135).
NOTE: Currently skipped due to AI Configurator database format issue.
The sglang 0.5.6.post2 database for h100_sxm is in legacy format and missing
the required 'gemm_dtype' field, causing KeyError during database loading.
"""
# Configure the mocks to return the appropriate info
# Configure the mock to return the appropriate model info
mock_get_model_info.return_value = mock_model_info
mock_get_gpu_summary.return_value = mock_h100_gpu_info
# Run the profile - the search space will be auto-generated
# based on the model and mocked GPU info
# based on the model and GPU info from args
auto_generate_search_space(sglang_args_with_model_autogen)
await run_profile(sglang_args_with_model_autogen)
......@@ -461,15 +466,14 @@ class TestProfileSLADryRun:
self.decode_interpolation_granularity = 6
self.service_name = ""
self.dry_run = True
self.use_ai_configurator = False
self.aic_system = None
self.aic_hf_id = None
self.aic_backend = ""
self.aic_backend_version = None
self.num_gpus_per_node = 0
self.system = "h100_sxm" # Renamed from aic_system, moved to hardware
self.search_strategy = SearchStrategy.RAPID # New top-level arg
# GPU discovery values (auto-populated by Operator)
self.num_gpus_per_node = 8
self.gpu_model = "H100-SXM5-80GB"
self.gpu_vram_mib = 81920
self.deploy_after_profile = False
self.pick_with_webui = False
self.enable_gpu_discovery = True
self.model_cache_pvc_name = ""
self.model_cache_pvc_path = ""
self.model_cache_pvc_mount_path = "/opt/model-cache"
......@@ -482,26 +486,91 @@ class TestProfileSLADryRun:
@pytest.mark.gpu_0
@pytest.mark.integration
@pytest.mark.trtllm
@patch("dynamo.profiler.utils.search_space_autogen.get_gpu_summary")
@patch("dynamo.profiler.utils.search_space_autogen.get_model_info")
@patch("dynamo.profiler.utils.model_info.get_model_info")
async def test_trtllm_profile_with_autogen_search_space_h100(
self,
mock_get_model_info,
mock_get_gpu_summary,
trtllm_args_with_model_autogen,
mock_h100_gpu_info,
mock_model_info,
):
"""Test profile_sla with auto-generated search space for trtllm on mocked H100 cluster.
This test demonstrates how search space is auto-generated based on model
size and available GPU memory for trtllm backend.
size and available GPU memory for trtllm backend. GPU info is provided via
command-line arguments injected by the Operator into the profiling config (DYN-2135).
"""
# Configure the mocks to return the appropriate info
# Configure the mock to return the appropriate model info
mock_get_model_info.return_value = mock_model_info
mock_get_gpu_summary.return_value = mock_h100_gpu_info
# Run the profile - the search space will be auto-generated
# based on the model and mocked GPU info
# based on the model and GPU info from args
auto_generate_search_space(trtllm_args_with_model_autogen)
await run_profile(trtllm_args_with_model_autogen)
# Unit tests for search_strategy and system attributes
@pytest.mark.pre_merge
@pytest.mark.unit
@pytest.mark.gpu_0
def test_vllm_args_has_search_strategy(self, vllm_args):
    """Check the search-strategy and system values carried by ``vllm_args``.

    The object must expose ``search_strategy`` equal to
    ``SearchStrategy.THOROUGH`` and ``system`` equal to the empty string.
    """
    expected_values = (
        ("search_strategy", SearchStrategy.THOROUGH),
        ("system", ""),
    )
    for attr_name, expected in expected_values:
        # Presence is asserted on its own so a missing attribute is reported
        # as a failed assertion rather than as an AttributeError.
        assert hasattr(vllm_args, attr_name)
        assert getattr(vllm_args, attr_name) == expected
@pytest.mark.pre_merge
@pytest.mark.unit
@pytest.mark.gpu_0
def test_sglang_args_has_search_strategy(self, sglang_args):
    """Check the search-strategy and system values carried by ``sglang_args``.

    The object must expose ``search_strategy`` equal to
    ``SearchStrategy.THOROUGH`` and ``system`` equal to the empty string.
    """
    expected_values = (
        ("search_strategy", SearchStrategy.THOROUGH),
        ("system", ""),
    )
    for attr_name, expected in expected_values:
        # Presence is asserted on its own so a missing attribute is reported
        # as a failed assertion rather than as an AttributeError.
        assert hasattr(sglang_args, attr_name)
        assert getattr(sglang_args, attr_name) == expected
@pytest.mark.pre_merge
@pytest.mark.unit
@pytest.mark.gpu_0
def test_trtllm_args_has_search_strategy(self, trtllm_args):
    """Check the search-strategy and system values carried by ``trtllm_args``.

    The object must expose ``search_strategy`` equal to
    ``SearchStrategy.THOROUGH`` and ``system`` equal to the empty string.
    """
    expected_values = (
        ("search_strategy", SearchStrategy.THOROUGH),
        ("system", ""),
    )
    for attr_name, expected in expected_values:
        # Presence is asserted on its own so a missing attribute is reported
        # as a failed assertion rather than as an AttributeError.
        assert hasattr(trtllm_args, attr_name)
        assert getattr(trtllm_args, attr_name) == expected
@pytest.mark.pre_merge
@pytest.mark.unit
@pytest.mark.gpu_0
def test_sglang_moe_args_has_search_strategy(self, sglang_moe_args):
    """Check the search-strategy and system values carried by ``sglang_moe_args``.

    The object must expose ``search_strategy`` equal to
    ``SearchStrategy.THOROUGH`` and ``system`` equal to the empty string.
    """
    expected_values = (
        ("search_strategy", SearchStrategy.THOROUGH),
        ("system", ""),
    )
    for attr_name, expected in expected_values:
        # Presence is asserted on its own so a missing attribute is reported
        # as a failed assertion rather than as an AttributeError.
        assert hasattr(sglang_moe_args, attr_name)
        assert getattr(sglang_moe_args, attr_name) == expected
@pytest.mark.pre_merge
@pytest.mark.unit
@pytest.mark.gpu_0
def test_model_autogen_args_have_rapid_strategy(
    self,
    vllm_args_with_model_autogen,
    sglang_args_with_model_autogen,
    trtllm_args_with_model_autogen,
):
    """Check strategy, system and GPU values on the three autogen args objects.

    Each of the vllm, sglang and trtllm autogen arguments must expose
    ``search_strategy`` equal to ``SearchStrategy.RAPID``, ``system`` equal to
    ``"h100_sxm"``, and the GPU attributes ``num_gpus_per_node`` (8),
    ``gpu_model`` ("H100-SXM5-80GB") and ``gpu_vram_mib`` (81920).
    """
    expected_values = (
        ("search_strategy", SearchStrategy.RAPID),
        ("system", "h100_sxm"),
        # GPU discovery attributes
        ("num_gpus_per_node", 8),
        ("gpu_model", "H100-SXM5-80GB"),
        ("gpu_vram_mib", 81920),
    )
    autogen_args = (
        vllm_args_with_model_autogen,
        sglang_args_with_model_autogen,
        trtllm_args_with_model_autogen,
    )
    for args_under_test in autogen_args:
        for attr_name, expected in expected_values:
            # Presence is asserted on its own so a missing attribute is
            # reported as a failed assertion rather than as an AttributeError.
            assert hasattr(args_under_test, attr_name)
            assert getattr(args_under_test, attr_name) == expected
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment