"vllm/vscode:/vscode.git/clone" did not exist on "ace32edb319aefb43661d446e66ac84b1756c6be"
Unverified commit 4c648b11, authored by Hongkuan Zhou, committed by GitHub
Browse files

refactor: move core logics of DPP -> AIC and support static profiling (#6285)


Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
Co-authored-by: hhzhang16 <54051230+hhzhang16@users.noreply.github.com>
parent f6d4351f
......@@ -1327,7 +1327,7 @@ _Appears in:_
| --- | --- | --- | --- |
| `model` _string_ | Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").<br />Can be a HuggingFace ID or a private model name. | | MinLength: 1 <br />Required: \{\} <br /> |
| `backend` _[BackendType](#backendtype)_ | Backend specifies the inference backend to use for profiling and deployment. | auto | Enum: [auto sglang trtllm vllm] <br />Optional: \{\} <br /> |
| `image` _string_ | Image is the container image reference for the profiling job (frontend image).<br />Example: "nvcr.io/nvidia/dynamo-runtime:latest"<br />backend type automatically; backend images can be overridden via overrides.dgd. | | Optional: \{\} <br /> |
| `image` _string_ | Image is the container image reference for the profiling job (frontend image).<br />Example: "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:1.0.0". | | Optional: \{\} <br /> |
| `modelCache` _[ModelCacheSpec](#modelcachespec)_ | ModelCache provides optional PVC configuration for pre-downloaded model weights.<br />When provided, weights are loaded from the PVC instead of downloading from HuggingFace. | | Optional: \{\} <br /> |
| `hardware` _[HardwareSpec](#hardwarespec)_ | Hardware describes the hardware resources available for profiling and deployment.<br />Typically auto-filled by the operator from cluster discovery. | | Optional: \{\} <br /> |
| `workload` _[WorkloadSpec](#workloadspec)_ | Workload defines the expected workload characteristics for SLA-based profiling. | | Optional: \{\} <br /> |
......
......@@ -139,14 +139,14 @@ line_length = 88
balanced_wrapping = true
indent = " "
skip = ["build"]
known_first_party = ["dynamo"]
known_first_party = ["dynamo", "deploy"]
# isort may misclassify a library as first party or third party. For example,
# when dynamo/vllm/omni/xx.py imports vllm, a local isort run may heuristically treat
# this `vllm` as first party. The local sort order then differs from the GitHub sort
# order, and pre-commit fails. To mitigate: 1) install the third-party library so that
# isort is aware of it, 2) hardcode the third-party library here, or 3) as a last
# resort, add "# isort: skip_file" to the problematic files.
known_third_party = ["vllm", "tensorrt_llm", "sglang"]
known_third_party = ["vllm", "tensorrt_llm", "sglang", "aiconfigurator"]
[tool.pytest.ini_options]
minversion = "8.0"
......@@ -187,6 +187,7 @@ filterwarnings = [
"ignore:.*unclosed event loop.*:ResourceWarning", # Ignore unclosed event loop warnings
"ignore:.*Exception ignored in.*:pytest.PytestUnraisableExceptionWarning", # Ignore unraisable exception warnings
"ignore:The pynvml package is deprecated.*:FutureWarning", # Ignore pynvml deprecation warning, temporary until upstream library updates to nvidia-ml-py
"ignore:The behavior of DataFrame concatenation with empty or all-NA entries is deprecated.*:FutureWarning", # pandas 2.x concat deprecation in AIC SDK TODO: fix in AIC
# Pydantic V2 deprecation warnings from TRTLLM dependencies (raised at import time during collection)
"ignore:Support for class-based `config`.*:pydantic.warnings.PydanticDeprecatedSince20",
"ignore:Using extra keyword arguments on `Field`.*:pydantic.warnings.PydanticDeprecatedSince20",
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Case 10: Thorough sweep with DGD overrides for imagePullSecrets.
# Verifies that overrides can inject new spec-level fields (imagePullSecrets)
# that do not exist in the base DGD template.
model: "Qwen/Qwen3-32B"
backend: trtllm
image: "nvcr.io/nvidia/dynamo:latest"
hardware:
gpuSku: h200_sxm
totalGpus: 8
numGpusPerNode: 8
searchStrategy: thorough
overrides:
dgd:
spec:
imagePullSecrets:
- name: my-registry-secret
- name: nvcr-pull-secret
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Case 11: Auto backend, rapid, without planner, no input load
model: "Qwen/Qwen3-32B"
image: "hongkuanz196/trtllm-runtime:hzhou-0224"
hardware:
gpuSku: h200_sxm
totalGpus: 8
numGpusPerNode: 8
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Case 1: AIC supported model, rapid, without planner, no input load
model: "Qwen/Qwen3-32B"
backend: trtllm
image: "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:latest"
hardware:
gpuSku: h200_sxm
totalGpus: 8
numGpusPerNode: 8
sla:
itl: 50.0
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Case 2: AIC supported model, rapid, without planner, input load (request rate)
model: "Qwen/Qwen3-32B"
backend: trtllm
image: "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:latest"
hardware:
gpuSku: h200_sxm
totalGpus: 64
numGpusPerNode: 8
workload:
requestRate: 5.0
sla:
itl: 50.0
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Case 2b: AIC supported model, rapid, without planner, input load, with PVC model cache
model: "Qwen/Qwen3-32B"
backend: trtllm
image: "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:latest"
hardware:
gpuSku: h200_sxm
totalGpus: 64
numGpusPerNode: 8
modelCache:
pvcName: model-cache
pvcModelPath: /model/Qwen3-32B
workload:
requestRate: 5.0
sla:
itl: 50.0
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Case 2c: AIC supported model, rapid, without planner, e2eLatency instead of ttft/itl
model: "Qwen/Qwen3-32B"
backend: trtllm
image: "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:latest"
hardware:
gpuSku: h200_sxm
totalGpus: 64
numGpusPerNode: 8
workload:
requestRate: 5.0
sla:
ttft: null
itl: null
e2eLatency: 35000.0
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Case 2d: Both concurrency and requestRate specified — should fail validation
model: "Qwen/Qwen3-32B"
backend: trtllm
image: "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:latest"
hardware:
gpuSku: h200_sxm
totalGpus: 64
numGpusPerNode: 8
workload:
concurrency: 50
requestRate: 5.0
sla:
itl: 50.0
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Case 3: AIC supported model, rapid, with planner, rapid pre-deployment sweeping
model: "Qwen/Qwen3-32B"
backend: trtllm
image: "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:latest"
hardware:
gpuSku: h200_sxm
totalGpus: 8
numGpusPerNode: 8
sla:
itl: 50.0
features:
planner:
pre_deployment_sweeping_mode: rapid
enable_throughput_scaling: true
enable_load_scaling: false
mode: disagg
backend: trtllm
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Case 3b: AIC supported model, rapid, with planner, rapid pre-deployment sweeping, enable mocker
model: "Qwen/Qwen3-32B"
backend: trtllm
image: "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:latest"
hardware:
gpuSku: h200_sxm
totalGpus: 8
numGpusPerNode: 8
sla:
itl: 50.0
features:
planner:
pre_deployment_sweeping_mode: rapid
enable_throughput_scaling: true
enable_load_scaling: false
mode: disagg
backend: trtllm
mocker:
enabled: true
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Case 4: AIC unsupported model, rapid, without planner
# l40s + vllm has no disagg support in AIC
model: "Qwen/Qwen3-32B"
backend: vllm
image: "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:latest"
hardware:
gpuSku: l40s
totalGpus: 4
numGpusPerNode: 4
vramMb: 48000
sla:
itl: 50.0
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment