Unverified Commit d56439ec authored by hhzhang16's avatar hhzhang16 Committed by GitHub
Browse files

feat: migrate GPU discovery from Dynamo Profiler to Dynamo Operator with...


feat: migrate GPU discovery from Dynamo Profiler to Dynamo Operator with automatic injection (#6224)
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
parent 233a1e9a
...@@ -28,6 +28,7 @@ import ( ...@@ -28,6 +28,7 @@ import (
func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) { func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
validConfig := `{"engine": {"backend": "vllm"}, "deployment": {"model": "test-model"}}` validConfig := `{"engine": {"backend": "vllm"}, "deployment": {"model": "test-model"}}`
validConfigWithHardware := `{"engine": {"backend": "vllm"}, "deployment": {"model": "test-model"}, "hardware": {"numGpusPerNode": 8, "gpuModel": "H100-SXM5-80GB", "gpuVramMib": 81920}}`
configWithDifferentBackend := `{"engine": {"backend": "sglang"}}` configWithDifferentBackend := `{"engine": {"backend": "sglang"}}`
configWithDifferentModel := `{"deployment": {"model": "different-model"}}` configWithDifferentModel := `{"deployment": {"model": "different-model"}}`
invalidYAML := `{invalid yaml` invalidYAML := `{invalid yaml`
...@@ -128,65 +129,19 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) { ...@@ -128,65 +129,19 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
errMsg: "spec.profilingConfig.config is required and must not be empty", errMsg: "spec.profilingConfig.config is required and must not be empty",
}, },
{ {
name: "enableGpuDiscovery true for cluster-wide operator", name: "namespace-restricted operator (GPU discovery will fail gracefully)",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr", Name: "test-dgdr",
Namespace: "default", Namespace: "default",
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b", Model: "llama-3-8b",
Backend: "vllm", Backend: "vllm",
EnableGpuDiscovery: true,
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
},
},
},
},
isClusterWide: true,
wantErr: false,
},
{
name: "enableGpuDiscovery true for namespace-restricted operator",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
EnableGpuDiscovery: true,
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
},
},
},
},
isClusterWide: false,
wantErr: true,
errMsg: "spec.enableGpuDiscovery can only be set to true for cluster-wide operators. Namespace-restricted operators cannot access cluster nodes for GPU discovery. Please set enableGpuDiscovery to false and provide hardware configuration (hardware.min_num_gpus_per_engine, hardware.max_num_gpus_per_engine, hardware.num_gpus_per_node) in spec.profilingConfig.config",
},
{
name: "enableGpuDiscovery false for namespace-restricted operator",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
EnableGpuDiscovery: false,
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest", ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{ Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig), Raw: []byte(validConfigWithHardware),
}, },
}, },
}, },
...@@ -263,16 +218,15 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) { ...@@ -263,16 +218,15 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
expectedWarning: "spec.profilingConfig.config.deployment.model (different-model) will be overwritten by spec.model (llama-3-8b)", expectedWarning: "spec.profilingConfig.config.deployment.model (different-model) will be overwritten by spec.model (llama-3-8b)",
}, },
{ {
name: "multiple errors (missing profiler image, missing config, and enableGpuDiscovery for namespace-restricted)", name: "multiple errors (missing profiler image and missing config)",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr", Name: "test-dgdr",
Namespace: "default", Namespace: "default",
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b", Model: "llama-3-8b",
Backend: "vllm", Backend: "vllm",
EnableGpuDiscovery: true,
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "", ProfilerImage: "",
Config: nil, Config: nil,
...@@ -281,9 +235,12 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) { ...@@ -281,9 +235,12 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
}, },
isClusterWide: false, isClusterWide: false,
wantErr: true, wantErr: true,
errMsg: "spec.profilingConfig.profilerImage is required\nspec.profilingConfig.config is required and must not be empty\nspec.enableGpuDiscovery can only be set to true for cluster-wide operators", errMsg: "spec.profilingConfig.profilerImage is required\nspec.profilingConfig.config is required and must not be empty",
errContains: true, errContains: true,
}, },
// TODO: Add test for invalid GPU range (min > max) validation
// The validation logic is in place (lines 148-152 of dynamographdeploymentrequest.go)
// but needs proper test coverage
} }
for _, tt := range tests { for _, tt := range tests {
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import argparse
import json
import logging
import re
import shutil
import subprocess
import time
import uuid
from dataclasses import asdict, dataclass
from typing import Dict, List, Optional, Tuple, Union
from kubernetes import client, config
# Module-level logger with a dedicated console (stream) handler.
# Both the logger and the handler are set to INFO.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
# Record layout: "<timestamp> - <logger name> - <level> - <message>",
# with the timestamp rendered as YYYY-MM-DD HH:MM:SS.
formatter = logging.Formatter(
"%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
def run_command(cmd: List[str], capture_output: bool = True, exit_on_error: bool = True):  # type: ignore
    """Run ``cmd`` as a subprocess in text mode and return the completed process.

    Args:
        cmd: Program and arguments, passed to ``subprocess.run`` as a list
            (no shell is involved).
        capture_output: Forwarded to ``subprocess.run``; when true, stdout and
            stderr are captured on the returned object.
        exit_on_error: When true, a non-zero exit is logged (the command line
            plus any captured stdout/stderr) and re-raised as ``RuntimeError``.
            When false, the original ``CalledProcessError`` propagates.

    Returns:
        The ``subprocess.CompletedProcess`` of a command that exited with 0.

    Raises:
        RuntimeError: The command failed and ``exit_on_error`` is true.
        subprocess.CalledProcessError: The command failed and
            ``exit_on_error`` is false.
    """
    try:
        return subprocess.run(cmd, capture_output=capture_output, text=True, check=True)
    except subprocess.CalledProcessError as e:  # pragma: no cover - passthrough
        if not exit_on_error:
            raise
        rendered = " ".join(cmd)
        logger.error(f"Command failed: {rendered}")
        if e.stdout:
            logger.error(e.stdout)
        if e.stderr:
            logger.error(e.stderr)
        # Chain explicitly so the original exit status stays attached as __cause__.
        raise RuntimeError(f"Command failed: {rendered}") from e
# Names under the "nvidia.com/" prefix. The LABEL_* values are looked up in a
# node's metadata.labels by _extract_inventory; the bare prefix is also used
# there to build the "nvidia.com/gpu" and "nvidia.com/mig-" allocatable keys.
NVIDIA_PREFIX = "nvidia.com/"
LABEL_GPU_COUNT = f"{NVIDIA_PREFIX}gpu.count"
LABEL_GPU_PRODUCT = f"{NVIDIA_PREFIX}gpu.product"
LABEL_GPU_MEMORY = f"{NVIDIA_PREFIX}gpu.memory" # MiB per GPU
LABEL_MIG_CAPABLE = f"{NVIDIA_PREFIX}mig.capable"
@dataclass
class NodeGpuInventory:
    """GPU facts collected for one Kubernetes node.

    Optional fields are ``None`` when the corresponding value was not
    available or could not be parsed.
    """

    # Node name, or a placeholder such as "<unknown>" / "<aggregate>".
    node_name: str
    # Number of GPUs on the node.
    gpu_count: Optional[int]
    # GPU product/model string.
    gpu_product: Optional[str]
    # Memory per GPU in MiB.
    gpu_memory_mib: Optional[int]
    # Whether the node reports MIG capability.
    mig_capable: Optional[bool]
    # Allocatable "nvidia.com/gpu" quantity.
    allocatable_gpu: Optional[int]
    # Allocatable MIG resources: resource name -> count (kept as a string).
    mig_resources: Dict[str, str]

    def to_dict(self) -> Dict[str, Union[str, int, bool, Dict[str, str], None]]:
        """Return the inventory as a plain dict keyed by field name."""
        return asdict(self)
def _parse_int(value: Optional[str]) -> Optional[int]:
if value is None:
return None
try:
return int(value)
except (TypeError, ValueError):
match = re.search(r"\d+", str(value))
return int(match.group(0)) if match else None
def _bool_from_str(value: Optional[str]) -> Optional[bool]:
if value is None:
return None
s = str(value).strip().lower()
if s in {"true", "1", "yes"}:
return True
if s in {"false", "0", "no"}:
return False
return None
def _normalize_node(node: Union[client.V1Node, Dict]) -> Dict:
    """Return ``node`` as a plain dict.

    Objects exposing ``to_dict`` (such as ``client.V1Node``) are converted
    through it; anything else is assumed to already be a dict and is returned
    unchanged.
    """
    if hasattr(node, "to_dict"):
        return node.to_dict()
    return node  # assume already dict
def _extract_inventory(node_obj: Dict) -> NodeGpuInventory:
    """Build a ``NodeGpuInventory`` from a node represented as a dict.

    Reads ``metadata.name``, ``metadata.labels`` and ``status.allocatable``.
    Missing sections, or sections present with a ``None`` value, are treated
    as empty.

    Args:
        node_obj: Node dict with optional ``metadata`` and ``status`` entries.

    Returns:
        The inventory for the node; fields that cannot be determined are
        ``None`` and the node name falls back to ``"<unknown>"``.
    """
    # `or {}` also covers keys that exist but hold None.
    meta = node_obj.get("metadata") or {}
    status = node_obj.get("status") or {}
    labels = meta.get("labels") or {}
    alloc = status.get("allocatable") or {}

    alloc_gpu = _parse_int(alloc.get(f"{NVIDIA_PREFIX}gpu"))

    # Prefer the gpu.count label if present; otherwise use allocatable nvidia.com/gpu.
    gpu_count = _parse_int(labels.get(LABEL_GPU_COUNT))
    if gpu_count is None:
        gpu_count = alloc_gpu

    # Keep MIG resources whose quantity parses to a non-zero integer.
    mig_prefix = f"{NVIDIA_PREFIX}mig-"
    mig_resources: Dict[str, str] = {
        key: str(qty)
        for key, qty in alloc.items()
        if isinstance(key, str) and key.startswith(mig_prefix) and _parse_int(str(qty))
    }

    return NodeGpuInventory(
        node_name=meta.get("name", "<unknown>"),
        gpu_count=gpu_count,
        gpu_product=labels.get(LABEL_GPU_PRODUCT),
        gpu_memory_mib=_parse_int(labels.get(LABEL_GPU_MEMORY)),
        mig_capable=_bool_from_str(labels.get(LABEL_MIG_CAPABLE)),
        allocatable_gpu=alloc_gpu,
        mig_resources=mig_resources,
    )
def _list_nodes_via_client() -> List[Dict]:
    """List cluster nodes through the Kubernetes Python client.

    Loads the in-cluster configuration, so this is expected to run inside a
    pod with a service account.

    Returns:
        One dict per node, produced by ``_normalize_node``.

    Raises:
        RuntimeError: The in-cluster configuration could not be loaded.
    """
    try:
        config.load_incluster_config()
    except Exception as e:
        raise RuntimeError(
            f"Failed to load in-cluster Kubernetes config. Ensure this runs in a pod with a service account. Error: {e}"
        ) from e
    core_api = client.CoreV1Api()
    nodes = core_api.list_node().items  # type: ignore[attr-defined]
    return [_normalize_node(n) for n in nodes]
def _list_nodes_via_kubectl() -> List[Dict]:
    """List cluster nodes by running ``kubectl get nodes -o json``.

    Returns:
        The ``items`` array of the kubectl JSON output; an empty list when
        the key is missing or null.

    Raises:
        RuntimeError: ``kubectl`` is not on PATH, or the command fails
            (raised by ``run_command``).
    """
    if not shutil.which("kubectl"):
        raise RuntimeError("kubectl not found in PATH for fallback")
    result = run_command(["kubectl", "get", "nodes", "-o", "json"], capture_output=True)
    payload = json.loads(result.stdout)
    # `or []` keeps the declared list return type even for `"items": null`.
    return payload.get("items") or []
def collect_gpu_inventory(
    prefer_client: bool = True,
) -> Tuple[List[NodeGpuInventory], str]:
    """Collect per-node GPU inventory, falling back between two node sources.

    Args:
        prefer_client: When true, try the Kubernetes Python client first and
            kubectl second; when false, the order is reversed.

    Returns:
        ``(inventory, sources)`` where ``sources`` is the comma-joined list of
        sources attempted up to and including the one that succeeded
        ("kubernetes-client" and/or "kubectl-json").

    Raises:
        RuntimeError: Both sources failed; the message joins both error texts
            with " | " in the order they were tried.
    """

    def _via_client() -> List[NodeGpuInventory]:
        return [_extract_inventory(n) for n in _list_nodes_via_client()]

    def _via_kubectl() -> List[NodeGpuInventory]:
        return [_extract_inventory(n) for n in _list_nodes_via_kubectl()]

    backends = [("kubernetes-client", _via_client), ("kubectl-json", _via_kubectl)]
    if not prefer_client:
        backends.reverse()

    sources_tried: List[str] = []
    errors: List[str] = []
    for label, fetch in backends:
        # Record the attempt before running it so a successful fallback
        # reports every source that was tried.
        sources_tried.append(label)
        try:
            return fetch(), ",".join(sources_tried)
        except Exception as e:
            errors.append(str(e))
    raise RuntimeError("Failed to list nodes: " + " | ".join(errors))
def _format_gib(mib: Optional[int]) -> str:
if mib is None:
return ""
return f"{mib/1024:.1f} GiB"
def print_table(rows: List[NodeGpuInventory], show_mig: bool = False) -> None:
    """Log the inventory as a column-aligned text table (one log record per line).

    Columns are NODE, GPUS, MODEL, VRAM/GPU and MIG. Unknown values render as
    empty cells.

    Args:
        rows: Inventories to print, in order.
        show_mig: When true, the MIG column lists the MIG resources as
            ``name=count`` pairs (or "capable"/"no"); when false it shows
            "yes" for MIG-capable nodes and is empty otherwise.
    """
    headers = ["NODE", "GPUS", "MODEL", "VRAM/GPU", "MIG"]
    table: List[List[str]] = []
    for r in rows:
        mig_str = ""
        if r.mig_capable is True:
            if r.mig_resources:
                # Show only the part after the "/" of each resource name.
                mig_str = ",".join(
                    f"{k.split('/')[-1]}={v}"
                    for k, v in sorted(r.mig_resources.items())
                )
            else:
                mig_str = "capable"
        elif r.mig_capable is False:
            mig_str = "no"
        table.append(
            [
                r.node_name,
                "" if r.gpu_count is None else str(r.gpu_count),
                r.gpu_product or "",
                _format_gib(r.gpu_memory_mib),
                mig_str if show_mig else ("yes" if r.mig_capable else ""),
            ]
        )

    # Each column is as wide as its widest cell, header included.
    widths = [len(h) for h in headers]
    for row in table:
        for i, cell in enumerate(row):
            widths[i] = max(widths[i], len(cell))

    def _fmt_row(row: List[str]) -> str:
        return " ".join(cell.ljust(widths[i]) for i, cell in enumerate(row))

    logger.info(_fmt_row(headers))
    logger.info(_fmt_row(["-" * w for w in widths]))
    for row in table:
        logger.info(_fmt_row(row))
def aggregate_valued_rows(
    rows: List[NodeGpuInventory],
) -> Tuple[Optional[NodeGpuInventory], int]:
    """Aggregate rows that have meaningful GPU metadata.

    Rows with neither a product nor a memory value are ignored. The remaining
    rows are grouped by ``(gpu_product, gpu_memory_mib)`` and one group is
    selected.

    Preference order when multiple distinct values exist:
      1) Larger GPUs per node (largest known ``gpu_count`` in the group)
      2) Larger VRAM per GPU (``gpu_memory_mib``; unknown ranks lowest)
    Ties keep the group that appeared first in ``rows``.

    Returns:
        ``(selected_row_like, distinct_count)``: a synthetic row named
        "<aggregate>" for the selected group (``gpu_count`` is ``None`` when
        no positive count is known) and the number of distinct groups, or
        ``(None, 0)`` when no row carries product/memory metadata.
    """
    valued: List[NodeGpuInventory] = [
        r for r in rows if (r.gpu_product is not None or r.gpu_memory_mib is not None)
    ]
    if not valued:
        return None, 0

    # (product, vram_mib) -> largest known GPU count in that group (0 if unknown).
    max_gpu_by_group: Dict[Tuple[Optional[str], Optional[int]], int] = {}
    for r in valued:
        key = (r.gpu_product, r.gpu_memory_mib)
        known = int(r.gpu_count) if r.gpu_count is not None else 0
        max_gpu_by_group[key] = max(max_gpu_by_group.get(key, 0), known)

    def rank(item: Tuple[Tuple[Optional[str], Optional[int]], int]) -> Tuple[int, int]:
        (_prod, mem_mib), max_gpu = item
        return (max_gpu, mem_mib if mem_mib is not None else -1)

    # max() returns the first maximal item, matching a stable descending sort.
    (sel_prod, sel_mem_mib), sel_gpu = max(max_gpu_by_group.items(), key=rank)

    selected = NodeGpuInventory(
        node_name="<aggregate>",
        gpu_count=sel_gpu if sel_gpu > 0 else None,
        gpu_product=sel_prod,
        gpu_memory_mib=sel_mem_mib,
        mig_capable=None,
        allocatable_gpu=None,
        mig_resources={},
    )
    return selected, len(max_gpu_by_group)
def _get_current_namespace(default: str = "default") -> str:
try:
with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r") as f:
return f.read().strip() or default
except Exception:
return default
def enrich_with_smi(
    rows: List[NodeGpuInventory],
    namespace: Optional[str] = None,
    timeout_seconds: int = 180,
    image: str = "nvidia/cuda:12.3.2-base-ubuntu22.04",
) -> None:
    """For nodes missing product/memory labels, schedule a short-lived pod on each node
    that requests 1 GPU and runs nvidia-smi to capture model and memory.

    Rows are updated in place; values that are already set are never overwritten.
    Nodes with no GPU count, or with both product and memory already known,
    are skipped.

    Requires permissions: create/get/delete pods and get pods/log in the namespace.

    Args:
        rows: Inventories to enrich in place.
        namespace: Namespace for the probe pods; defaults to the current
            service-account namespace.
        timeout_seconds: Maximum time to wait for each probe pod to reach
            Succeeded or Failed before reading whatever logs exist.
        image: Container image used for the probe pod; it must provide
            ``bash`` and ``nvidia-smi``.

    Raises:
        Exception: Errors from creating or polling a probe pod propagate to
            the caller (after a best-effort delete of that pod).
    """
    ns = namespace or _get_current_namespace()
    try:
        config.load_incluster_config()
    except Exception as e:
        # Best effort: keep going with whatever client configuration is active.
        logger.debug(f"In-cluster Kubernetes config not loaded: {e}")
    v1 = client.CoreV1Api()

    for inv in rows:
        if not inv.gpu_count or (
            inv.gpu_product is not None and inv.gpu_memory_mib is not None
        ):
            continue

        pod_name = f"gpu-inv-smi-{uuid.uuid4().hex[:6]}"
        container = client.V1Container(
            name="smi",
            image=image,
            command=["bash", "-lc"],
            args=[
                "nvidia-smi --query-gpu=name,memory.total --format=csv,noheader,nounits"
            ],
            resources=client.V1ResourceRequirements(
                limits={"nvidia.com/gpu": "1", "cpu": "100m", "memory": "128Mi"},
                requests={"nvidia.com/gpu": "1", "cpu": "50m", "memory": "64Mi"},
            ),
        )
        pod = client.V1Pod(
            api_version="v1",
            kind="Pod",
            metadata=client.V1ObjectMeta(name=pod_name, namespace=ns),
            spec=client.V1PodSpec(
                restart_policy="Never",
                # Pin the probe to the node being inspected.
                node_name=inv.node_name,
                containers=[container],
            ),
        )

        logs = ""
        try:
            v1.create_namespaced_pod(namespace=ns, body=pod)
            # Monotonic clock: the timeout must not be affected by wall-clock jumps.
            deadline = time.monotonic() + timeout_seconds
            while time.monotonic() < deadline:
                p = v1.read_namespaced_pod(name=pod_name, namespace=ns)
                phase = (p.status.phase or "").lower()
                if phase in ("succeeded", "failed"):
                    break
                time.sleep(2)
            try:
                logs = v1.read_namespaced_pod_log(name=pod_name, namespace=ns)
            except Exception:
                logs = ""
        finally:
            try:
                v1.delete_namespaced_pod(
                    name=pod_name, namespace=ns, body=client.V1DeleteOptions()
                )
            except Exception as e:
                # Cleanup stays best effort, but a leaked probe pod should be visible.
                logger.warning(f"Failed to delete probe pod {ns}/{pod_name}: {e}")

        # Expected log line shape: "<name>, <memory.total>"; use the first usable line.
        for line in logs.splitlines():
            parts = [x.strip() for x in line.split(",")]
            if len(parts) >= 2 and parts[0]:
                inv.gpu_product = inv.gpu_product or parts[0]
                mem_match = re.search(r"\d+", parts[1])
                if mem_match:
                    inv.gpu_memory_mib = inv.gpu_memory_mib or int(mem_match.group(0))
                break
def get_gpu_summary(
    prefer_client: bool = True, enrich_smi: bool = True
) -> Dict[str, object]:
    """Return an aggregate GPU summary for the cluster.

    Selection policy when multiple values exist: prefer higher GPUs per node,
    then higher VRAM/GPU. Returns dict with keys: gpus_per_node, model, vram.
    If model/VRAM unavailable anywhere, returns
    {"gpus_per_node": max_gpus, "model": "", "vram": 0}.

    Args:
        prefer_client: Forwarded to ``collect_gpu_inventory``.
        enrich_smi: When true, run ``enrich_with_smi`` on the collected rows
            before aggregating.
    """
    # TODO: use proper tools (i.e., DCGM) to get GPU inventory
    rows, _ = collect_gpu_inventory(prefer_client=prefer_client)
    if enrich_smi:
        enrich_with_smi(rows)

    agg, _distinct = aggregate_valued_rows(rows)
    if agg is None:
        # Fallback to max GPUs only
        max_gpus = 0
        for r in rows:
            if r.gpu_count is not None:
                max_gpus = max(max_gpus, int(r.gpu_count))
        return {"gpus_per_node": max_gpus, "model": "", "vram": 0}

    return {
        "gpus_per_node": int(agg.gpu_count) if agg.gpu_count is not None else 0,
        "model": agg.gpu_product or "",
        "vram": int(agg.gpu_memory_mib) if agg.gpu_memory_mib is not None else 0,
    }
def main() -> None:
    """Command-line entry point: collect the GPU inventory and report it.

    Flags:
        --format/-o {table,json}: output format (default: table).
        --prefer {client,kubectl}: which node source to try first.
        --show-mig: in table output, list MIG resource types and counts.
        --enrich-smi: probe nodes with nvidia-smi pods for missing model/VRAM.
        --aggregate: after the table, log one representative summary line.

    All output goes through the module logger.
    """
    parser = argparse.ArgumentParser(
        description="Report GPU inventory per Kubernetes node (count, SKU, VRAM)."
    )
    parser.add_argument(
        "--format",
        "-o",
        choices=["table", "json"],
        default="table",
        help="Output format",
    )
    parser.add_argument(
        "--prefer",
        choices=["client", "kubectl"],
        default="client",
        help="Prefer Kubernetes Python client or kubectl JSON fallback",
    )
    parser.add_argument(
        "--show-mig",
        action="store_true",
        help="In table output, show MIG resource types and counts",
    )
    parser.add_argument(
        "--enrich-smi",
        action="store_true",
        help="Schedule short-lived pods per node to fetch model/VRAM via nvidia-smi",
    )
    parser.add_argument(
        "--aggregate",
        action="store_true",
        help="Print a single representative (GPUs per node, MODEL, VRAM/GPU). Warn if multiple values exist",
    )
    args = parser.parse_args()

    prefer_client = args.prefer == "client"
    rows, source = collect_gpu_inventory(prefer_client=prefer_client)
    if args.enrich_smi:
        enrich_with_smi(rows)

    if args.format == "json":
        payload = {
            "source": source,
            "items": [r.to_dict() for r in rows],
        }
        logger.info(json.dumps(payload, indent=2))
        return

    # Table output
    print_table(rows, show_mig=args.show_mig)

    if args.aggregate:
        agg, distinct = aggregate_valued_rows(rows)
        if agg is None:
            logger.warning("No nodes expose MODEL/VRAM; cannot aggregate")
            return
        if distinct > 1:
            logger.warning(
                f"Multiple distinct GPU model/VRAM pairs detected across nodes: {distinct}. Showing highest GPUs per node, then highest VRAM/GPU."
            )
        # Print concise aggregate line
        model = agg.gpu_product or ""
        vram = _format_gib(agg.gpu_memory_mib)
        gpus = agg.gpu_count if agg.gpu_count is not None else ""
        logger.info(f"Aggregate => GPUS: {gpus} MODEL: {model} VRAM/GPU: {vram}")


if __name__ == "__main__":
    main()
This diff is collapsed.
This diff is collapsed.
...@@ -227,14 +227,15 @@ See [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator# ...@@ -227,14 +227,15 @@ See [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#
### Automatic GPU Discovery ### Automatic GPU Discovery
Cluster-scoped operators can optionally enable automatic GPU discovery: The operator automatically discovers GPU resources from your Kubernetes cluster nodes when available. GPU discovery provides:
```yaml - Hardware information (GPU model, VRAM, GPUs per node)
spec: - Automatic calculation of profiling search space based on model size
enableGpuDiscovery: true - Hardware system identifier for AI Configurator integration
```
**Permissions**: GPU discovery requires cluster-wide node read permissions. Cluster-scoped operators automatically have these permissions. Namespace-restricted operators can also use GPU discovery if granted node read permissions via RBAC.
This is only available with cluster-scoped operators (`namespaceRestriction.enabled=false`) as it requires cluster-wide node access permissions. If GPU discovery is unavailable (no permissions or no GPU labels), the profiler will use manually specified hardware configuration or defaults.
## Configuration ## Configuration
......
...@@ -462,8 +462,8 @@ _Appears in:_ ...@@ -462,8 +462,8 @@ _Appears in:_
| `model` _string_ | Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").<br />This is a high-level identifier for easy reference in kubectl output and logs.<br />The controller automatically sets this value in profilingConfig.config.deployment.model. | | Required: \{\} <br /> | | `model` _string_ | Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").<br />This is a high-level identifier for easy reference in kubectl output and logs.<br />The controller automatically sets this value in profilingConfig.config.deployment.model. | | Required: \{\} <br /> |
| `backend` _string_ | Backend specifies the inference backend for profiling.<br />The controller automatically sets this value in profilingConfig.config.engine.backend.<br />Profiling runs on real GPUs or via AIC simulation to collect performance data. | | Enum: [vllm sglang trtllm] <br />Required: \{\} <br /> | | `backend` _string_ | Backend specifies the inference backend for profiling.<br />The controller automatically sets this value in profilingConfig.config.engine.backend.<br />Profiling runs on real GPUs or via AIC simulation to collect performance data. | | Enum: [vllm sglang trtllm] <br />Required: \{\} <br /> |
| `useMocker` _boolean_ | UseMocker indicates whether to deploy a mocker DynamoGraphDeployment instead of<br />a real backend deployment. When true, the deployment uses simulated engines that<br />don't require GPUs, using the profiling data to simulate realistic timing behavior.<br />Mocker is available in all backend images and useful for large-scale experiments.<br />Profiling still runs against the real backend (specified above) to collect performance data. | false | | | `useMocker` _boolean_ | UseMocker indicates whether to deploy a mocker DynamoGraphDeployment instead of<br />a real backend deployment. When true, the deployment uses simulated engines that<br />don't require GPUs, using the profiling data to simulate realistic timing behavior.<br />Mocker is available in all backend images and useful for large-scale experiments.<br />Profiling still runs against the real backend (specified above) to collect performance data. | false | |
| `enableGpuDiscovery` _boolean_ | EnableGpuDiscovery controls whether the profiler should automatically discover GPU<br />resources from the Kubernetes cluster nodes. When enabled, the profiler will override<br />any manually specified hardware configuration (minNumGpusPerEngine, maxNumGpusPerEngine,<br />numGpusPerNode) with values detected from the cluster.<br />Requires cluster-wide node access permissions - only available with cluster-scoped operators. | false | Optional: \{\} <br /> | | `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.<br />Note: GPU discovery is automatically attempted to detect GPU resources from Kubernetes<br />cluster nodes. If the operator has node read permissions (cluster-wide or explicitly granted),<br />discovered GPU configuration is used as defaults when hardware configuration is not manually<br />specified (minNumGpusPerEngine, maxNumGpusPerEngine, numGpusPerNode). User-specified values<br />always take precedence over auto-discovered values. If GPU discovery fails (e.g.,<br />namespace-restricted operator without node permissions), manual hardware config is required.<br />This configuration is passed directly to the profiler.<br />The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).<br />Note: deployment.model and engine.backend are automatically set from the high-level<br />modelName and backend fields and should not be specified in this config. | | Required: \{\} <br /> |
| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.<br />This configuration is passed directly to the profiler.<br />The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).<br />Note: deployment.model and engine.backend are automatically set from the high-level<br />modelName and backend fields and should not be specified in this config. | | Required: \{\} <br /> | | `enableGpuDiscovery` _boolean_ | EnableGPUDiscovery controls whether the operator attempts to discover GPU hardware from cluster nodes.<br />DEPRECATED: This field is deprecated and will be removed in v1beta1. GPU discovery is now always<br />attempted automatically. Setting this field has no effect - the operator will always try to discover<br />GPU hardware when node read permissions are available. If discovery is unavailable (e.g., namespace-scoped<br />operator without permissions), manual hardware configuration is required regardless of this setting. | true | Optional: \{\} <br /> |
| `autoApply` _boolean_ | AutoApply indicates whether to automatically create a DynamoGraphDeployment<br />after profiling completes. If false, only the spec is generated and stored in status.<br />Users can then manually create a DGD using the generated spec. | false | | | `autoApply` _boolean_ | AutoApply indicates whether to automatically create a DynamoGraphDeployment<br />after profiling completes. If false, only the spec is generated and stored in status.<br />Users can then manually create a DGD using the generated spec. | false | |
| `deploymentOverrides` _[DeploymentOverridesSpec](#deploymentoverridesspec)_ | DeploymentOverrides allows customizing metadata for the auto-created DGD.<br />Only applicable when AutoApply is true. | | Optional: \{\} <br /> | | `deploymentOverrides` _[DeploymentOverridesSpec](#deploymentoverridesspec)_ | DeploymentOverrides allows customizing metadata for the auto-created DGD.<br />Only applicable when AutoApply is true. | | Optional: \{\} <br /> |
......
...@@ -18,6 +18,7 @@ project_root = Path(__file__).parent.parent.parent ...@@ -18,6 +18,7 @@ project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root)) sys.path.insert(0, str(project_root))
from dynamo.profiler.profile_sla import run_profile # noqa: E402 from dynamo.profiler.profile_sla import run_profile # noqa: E402
from dynamo.profiler.utils.defaults import SearchStrategy # noqa: E402
from dynamo.profiler.utils.model_info import ModelInfo # noqa: E402 from dynamo.profiler.utils.model_info import ModelInfo # noqa: E402
pytestmark = [ pytestmark = [
...@@ -44,7 +45,7 @@ class TestProfileSlaAiconfigurator: ...@@ -44,7 +45,7 @@ class TestProfileSlaAiconfigurator:
def llm_args(self, request): def llm_args(self, request):
class Args: class Args:
def __init__(self): def __init__(self):
self.model = "" self.model = "Qwen/Qwen3-32B" # Set to match aic_hf_id for consistency
self.dgd_image = "" self.dgd_image = ""
self.backend = "trtllm" self.backend = "trtllm"
self.config = "examples/backends/trtllm/deploy/disagg.yaml" self.config = "examples/backends/trtllm/deploy/disagg.yaml"
...@@ -63,14 +64,13 @@ class TestProfileSlaAiconfigurator: ...@@ -63,14 +64,13 @@ class TestProfileSlaAiconfigurator:
self.decode_interpolation_granularity = 6 self.decode_interpolation_granularity = 6
self.service_name = "" self.service_name = ""
self.dry_run = False self.dry_run = False
self.use_ai_configurator = True
self.aic_system = "h200_sxm"
self.aic_hf_id = "Qwen/Qwen3-32B"
self.aic_backend = ""
self.aic_backend_version = None
self.num_gpus_per_node = 8 self.num_gpus_per_node = 8
self.deploy_after_profile = False self.deploy_after_profile = False
self.pick_with_webui = False self.pick_with_webui = False
# Use RAPID strategy to leverage AI Configurator for perf estimation
# This avoids Kubernetes deployments while testing aiconfigurator functionality
self.search_strategy = SearchStrategy.RAPID
self.system = "h200_sxm" # Must match aic_system for RAPID strategy
# Provide minimal model_info to avoid HF queries # Provide minimal model_info to avoid HF queries
self.model_info = ModelInfo( self.model_info = ModelInfo(
model_size=16384.0, model_size=16384.0,
...@@ -86,10 +86,10 @@ class TestProfileSlaAiconfigurator: ...@@ -86,10 +86,10 @@ class TestProfileSlaAiconfigurator:
@pytest.mark.performance @pytest.mark.performance
@pytest.mark.parallel @pytest.mark.parallel
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("missing_arg", ["aic_system", "aic_hf_id"]) @pytest.mark.parametrize("missing_arg", ["system", "model"])
async def test_aiconfigurator_missing_args(self, llm_args, missing_arg): async def test_aiconfigurator_missing_args(self, llm_args, missing_arg):
# Check that validation error happens when a required arg is missing. # Check that validation error happens when a required arg is missing for RAPID strategy.
# Note: aic_backend_version is optional - when None, auto-detects latest version # These args are required when using SearchStrategy.RAPID with AI Configurator.
setattr(llm_args, missing_arg, None) setattr(llm_args, missing_arg, None)
with pytest.raises(ValueError): with pytest.raises(ValueError):
await run_profile(llm_args) await run_profile(llm_args)
...@@ -103,8 +103,7 @@ class TestProfileSlaAiconfigurator: ...@@ -103,8 +103,7 @@ class TestProfileSlaAiconfigurator:
"arg_name, bad_value", "arg_name, bad_value",
[ [
# these values don't exist in the aiconfigurator database. # these values don't exist in the aiconfigurator database.
("aic_system", "fake_gpu_system"), ("system", "fake_gpu_system"),
("aic_backend_version", "0.1.0"),
], ],
) )
async def test_aiconfigurator_no_data(self, llm_args, arg_name, bad_value): async def test_aiconfigurator_no_data(self, llm_args, arg_name, bad_value):
...@@ -131,14 +130,11 @@ class TestProfileSlaAiconfigurator: ...@@ -131,14 +130,11 @@ class TestProfileSlaAiconfigurator:
@pytest.mark.nightly @pytest.mark.nightly
# fmt: off # fmt: off
@pytest.mark.parametrize( @pytest.mark.parametrize(
"backend, aic_backend_version", "backend",
[ [
pytest.param("trtllm", None, marks=pytest.mark.trtllm), pytest.param("trtllm", marks=pytest.mark.trtllm),
pytest.param("trtllm", "1.2.0rc5", marks=pytest.mark.trtllm), pytest.param("vllm", marks=pytest.mark.vllm),
pytest.param("vllm", None, marks=pytest.mark.vllm), pytest.param("sglang", marks=pytest.mark.sglang),
pytest.param("vllm", "0.12.0", marks=pytest.mark.vllm),
pytest.param("sglang", None, marks=pytest.mark.sglang),
pytest.param("sglang", "0.5.6.post2", marks=pytest.mark.sglang),
], ],
) )
# fmt: on # fmt: on
...@@ -149,11 +145,10 @@ class TestProfileSlaAiconfigurator: ...@@ -149,11 +145,10 @@ class TestProfileSlaAiconfigurator:
"meta-llama/Llama-3.1-405B", "meta-llama/Llama-3.1-405B",
], ],
) )
async def test_aiconfigurator_dense_models( async def test_aiconfigurator_dense_models(self, llm_args, hf_model_id, backend):
self, llm_args, hf_model_id, backend, aic_backend_version # Test that profile_sla works with a variety of backends and model names
): # using AI Configurator's RAPID strategy for performance estimation.
# Test that profile_sla works with a variety of backend versions and model names. # Backend version is not used with RAPID strategy - performance comes from AI Configurator.
llm_args.aic_hf_id = hf_model_id llm_args.model = hf_model_id # Used by RAPID strategy
llm_args.backend = backend llm_args.backend = backend
llm_args.aic_backend_version = aic_backend_version
await run_profile(llm_args) await run_profile(llm_args)
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment