Unverified Commit 6a84ffd3 authored by hhzhang16's avatar hhzhang16 Committed by GitHub
Browse files

feat: turn profiling k8s jobs into sample DGDR requests (#3864)


Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
Signed-off-by: Hongkuan Zhou <tedzhouhk@gmail.com>
Co-authored-by: hongkuanz <hongkuanz@nvidia.com>
Co-authored-by: Hongkuan Zhou <tedzhouhk@gmail.com>
parent 0d07e2c3
......@@ -179,7 +179,7 @@ Rerun with `curl -N` and change `stream` in the request to `true` to get the res
Dynamo provides comprehensive benchmarking tools to evaluate and optimize your deployments:
- **[Benchmarking Guide](docs/benchmarks/benchmarking.md)** – Compare deployment topologies (aggregated vs. disaggregated vs. vanilla vLLM) using AIPerf
- **[Pre-Deployment Profiling](docs/benchmarks/pre_deployment_profiling.md)** – Optimize configurations before deployment to meet SLA requirements
- **[SLA-Driven Dynamo Deployments](docs/planner/sla_planner_quickstart.md)** – Optimize your deployment to meet SLA requirements
# Engines
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# DynamoGraphDeploymentRequest for AI Configurator-based profiling
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeploymentRequest
metadata:
  name: sla-aic
spec:
  model: Qwen/Qwen3-32B
  backend: trtllm

  # The `config` section below follows the profile_sla.py config format.
  profilingConfig:
    profilerImage: 'nvcr.io/nvidian/dynamo-dev/vllm-runtime:dep-540.5'
    config:
      # Sweep settings: profile through AI Configurator
      # (fast, simulation-based profiling).
      sweep:
        use_ai_configurator: true
        aic_system: h200_sxm
        aic_model_name: QWEN3_32B
        aic_backend_version: '0.20.0'

      # SLA targets used for profiling.
      sla:
        # Input and output sequence lengths.
        isl: 3000
        osl: 150
        # Latency targets in milliseconds:
        # Time To First Token and Inter-Token Latency.
        ttft: 500.0
        itl: 30.0

  # Overrides applied to the auto-created DGD.
  deploymentOverrides:
    workersImage: 'nvcr.io/nvidian/dynamo-dev/trtllm-runtime:dep-540.5'

  # Create the DynamoGraphDeployment automatically once profiling completes.
  autoApply: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# TODO: update to dgdr spec for AIC
apiVersion: batch/v1
kind: Job
metadata:
  name: profile-sla
  # ${...} placeholders are presumably substituted before apply
  # (e.g. envsubst) - confirm against the deploy instructions.
  namespace: ${NAMESPACE}
spec:
  template:
    spec:
      serviceAccountName: dynamo-sa
      containers:
        - name: profile-sla
          image: ${DOCKER_IMAGE}
          resources:
            requests:
              cpu: '16'
              memory: '10Gi'
          env:
            # Hugging Face token is read from a Secret rather than inlined.
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
                  key: HF_TOKEN
            - name: NATS_SERVER
              value: nats://${NAMESPACE}-nats:4222
            - name: ETCD_ENDPOINTS
              value: ${NAMESPACE}-etcd:2379
          # Entry point: python -m benchmarks.profiler.profile_sla
          command:
            - python
            - -m
            - benchmarks.profiler.profile_sla
          args:
            # Base DGD config and where results are written (on the PVC mount).
            - --config
            - ${DGD_CONFIG_FILE}
            - --output-dir
            - /data/profiling_results
            - --namespace
            - ${NAMESPACE}
            - --backend
            - vllm
            # GPU count range to consider per engine.
            - --min-num-gpus-per-engine
            - "1"
            - --max-num-gpus-per-engine
            - "8"
            # SLA targets: sequence lengths and latency targets.
            - --isl
            - "3000"
            - --osl
            - "150"
            - --ttft
            - "500"
            - --itl
            - "30"
            # AI Configurator mode and its parameters.
            - --use-ai-configurator
            - --aic-system
            - h200_sxm
            - --aic-model-name
            - QWEN3_32B
            - --aic-backend-version
            - 0.20.0
          volumeMounts:
            - name: output-volume
              mountPath: /data
      # One-shot run: the pod is never restarted and the Job is never retried.
      restartPolicy: Never
      volumes:
        - name: output-volume
          persistentVolumeClaim:
            claimName: dynamo-pvc
  backoffLimit: 0
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# DynamoGraphDeploymentRequest for standard online profiling
# Converted from profile_sla_job.yaml
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeploymentRequest
metadata:
  name: sla-online
spec:
  model: Qwen/Qwen3-0.6B
  backend: vllm

  # The `config` section below follows the profile_sla.py config format.
  profilingConfig:
    profilerImage: 'nvcr.io/nvidian/dynamo-dev/vllm-runtime:dep-540.5'
    config:
      # Sweep settings: standard online profiling, AI Configurator disabled.
      sweep:
        skip_existing_results: true
        use_ai_configurator: false

      # SLA targets used for profiling.
      sla:
        # Input and output sequence lengths.
        isl: 3000
        osl: 150
        # Latency targets in milliseconds:
        # Time To First Token and Inter-Token Latency.
        ttft: 200.0
        itl: 20.0

  # Overrides applied to the auto-created DGD.
  deploymentOverrides:
    workersImage: 'nvcr.io/nvidian/dynamo-dev/vllm-runtime:dep-540.5'

  # Create the DynamoGraphDeployment automatically once profiling completes.
  autoApply: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# TODO: update to dgdr spec for online mode
apiVersion: batch/v1
kind: Job
metadata:
  name: profile-sla
  # ${...} placeholders are presumably substituted before apply
  # (e.g. envsubst) - confirm against the deploy instructions.
  namespace: ${NAMESPACE}
spec:
  template:
    spec:
      serviceAccountName: dynamo-sa
      containers:
        - name: profile-sla
          image: ${DOCKER_IMAGE}
          resources:
            requests:
              cpu: '16'
              memory: '10Gi'
          env:
            # Hugging Face token is read from a Secret rather than inlined.
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
                  key: HF_TOKEN
            - name: NATS_SERVER
              value: nats://${NAMESPACE}-nats:4222
            - name: ETCD_ENDPOINTS
              value: ${NAMESPACE}-etcd:2379
          # Entry point: python -m benchmarks.profiler.profile_sla
          command:
            - python
            - -m
            - benchmarks.profiler.profile_sla
          args:
            # Base DGD config and where results are written (on the PVC mount).
            - --config
            - ${DGD_CONFIG_FILE}
            - --output-dir
            - /data/profiling_results
            - --namespace
            - ${NAMESPACE}
            - --backend
            - vllm
            # GPU count range to consider per engine.
            - --min-num-gpus-per-engine
            - "1"
            - --max-num-gpus-per-engine
            - "8"
            # SLA targets: sequence lengths and latency targets.
            - --isl
            - "3000"
            - --osl
            - "150"
            - --ttft
            - "200"
            - --itl
            - "20"
          volumeMounts:
            - name: output-volume
              mountPath: /data
      # One-shot run: the pod is never restarted and the Job is never retried.
      restartPolicy: Never
      volumes:
        - name: output-volume
          persistentVolumeClaim:
            claimName: dynamo-pvc
  backoffLimit: 0
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# DynamoGraphDeploymentRequest for MoE model profiling
# Converted from profile_sla_moe_job.yaml
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeploymentRequest
metadata:
  name: sla-moe
spec:
  model: deepseek-ai/DeepSeek-R1
  backend: sglang

  # The `config` section below follows the profile_sla.py config format.
  profilingConfig:
    profilerImage: 'nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1'
    config:
      # Engine settings.
      engine:
        # Turns on MoE model support (TEP/DEP is used instead of TP).
        is_moe_model: true

      # Sweep settings: standard online profiling, AI Configurator disabled.
      sweep:
        use_ai_configurator: false

      # SLA targets used for profiling.
      sla:
        # Input and output sequence lengths.
        isl: 3000
        osl: 150
        # Latency targets in milliseconds:
        # Time To First Token and Inter-Token Latency.
        ttft: 200.0
        itl: 20.0

    # ConfigMap entry holding the DGD base config. For MoE models it should
    # point at the appropriate disagg config. The Job manifest this request
    # replaces passed
    # /sgl-workspace/dynamo/recipes/deepseek-r1/sglang/disagg-16gpu/deploy.yaml
    # as --config.
    configMapRef:
      name: deepseek-r1-config
      key: tep16p-dep16d-disagg.yaml

  # Overrides applied to the auto-created DGD.
  deploymentOverrides:
    workersImage: 'nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1'

  # Create the DynamoGraphDeployment automatically once profiling completes.
  autoApply: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# TODO: update to dgdr spec for MoE model
apiVersion: batch/v1
kind: Job
metadata:
  name: profile-sla
  # ${...} placeholders are presumably substituted before apply
  # (e.g. envsubst) - confirm against the deploy instructions.
  namespace: ${NAMESPACE}
spec:
  template:
    spec:
      serviceAccountName: dynamo-sa
      containers:
        - name: profile-sla
          image: ${DOCKER_IMAGE}
          resources:
            requests:
              cpu: '32'
              memory: '50Gi'
          env:
            # Hugging Face token is read from a Secret rather than inlined.
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
                  key: HF_TOKEN
            - name: NATS_SERVER
              value: nats://${NAMESPACE}-nats:4222
            - name: ETCD_ENDPOINTS
              value: ${NAMESPACE}-etcd:2379
          workingDir: /sgl-workspace/dynamo
          # Entry point: python -m benchmarks.profiler.profile_sla
          command:
            - python
            - -m
            - benchmarks.profiler.profile_sla
          args:
            # Base DGD config and where results are written (on the PVC mount).
            - --config
            - /sgl-workspace/dynamo/recipes/deepseek-r1/sglang/disagg-16gpu/deploy.yaml
            - --output-dir
            - /data/profiling_results
            - --namespace
            - ${NAMESPACE}
            - --backend
            - sglang
            # MoE model profiling.
            - --is-moe-model
            # GPU count range to consider per engine.
            - --min-num-gpus-per-engine
            - "8"
            - --max-num-gpus-per-engine
            - "16"
            # SLA targets: sequence lengths and latency targets.
            - --isl
            - "3000"
            - --osl
            - "150"
            - --ttft
            - "200"
            - --itl
            - "20"
          volumeMounts:
            - name: output-volume
              mountPath: /data
      # One-shot run: the pod is never restarted and the Job is never retried.
      restartPolicy: Never
      volumes:
        - name: output-volume
          persistentVolumeClaim:
            claimName: dynamo-pvc
  backoffLimit: 0
......@@ -92,6 +92,11 @@ async def run_profile(args):
with open(args.config, "r") as f:
config = yaml.safe_load(f)
config = config_modifier.update_model(config, args.model)
if args.dgd_image:
config = config_modifier.update_image(config, args.dgd_image)
logger.info(f"Using DGD image: {args.dgd_image}")
if args.is_moe_model:
# For MoE models, use range with stride of num_gpus_per_node
profile_num_gpus = list(
......
......@@ -358,6 +358,29 @@ def set_argument_value(args: list, arg_name: str, value: str):
return args
def update_image(config: dict, image: str) -> dict:
    """Apply one container image to every DGD service (frontend, planner, workers).

    Shared helper that each backend config modifier delegates to. Only
    services that already define ``extraPodSpec.mainContainer`` are touched;
    all other services are left as they are.

    Args:
        config: Configuration dictionary.
        image: Container image to set on the services.

    Returns:
        The configuration dumped back to a dictionary after the update.
    """
    validated = Config.model_validate(config)

    for name, service in validated.spec.services.items():
        pod_spec = service.extraPodSpec
        # Skip services that have no main container to update.
        if not pod_spec or not pod_spec.mainContainer:
            continue
        pod_spec.mainContainer.image = image
        logger.debug(f"Updated image for {name} to {image}")

    return validated.model_dump()
class ConfigModifierProtocol(Protocol):
@classmethod
def convert_config(
......@@ -419,6 +442,10 @@ class ConfigModifierProtocol(Protocol):
def update_model(cls, config: dict, model_name: str) -> dict:
...
@classmethod
def update_image(cls, config: dict, image: str) -> dict:
...
def generate_dgd_config_with_planner(
config_path: str,
......@@ -450,6 +477,15 @@ def generate_dgd_config_with_planner(
with open(config_path, "r") as f:
config = yaml.safe_load(f)
# Update model name in config from profiling args
# This ensures the final DGD uses the model specified in the DGDR, not the default in the config file
config = config_modifier.update_model(config, args.model)
# Update container image if provided
# This overrides the default image in the config file for all DGD components
if args.dgd_image:
config = config_modifier.update_image(config, args.dgd_image)
if not is_moe_model:
# dense model, use TP for both prefill and decode
config = config_modifier.set_config_tp_size(
......
......@@ -16,6 +16,7 @@ from benchmarks.profiler.utils.config import (
remove_valued_arguments,
set_argument_value,
setup_worker_service_resources,
update_image,
validate_and_get_worker_args,
)
from benchmarks.profiler.utils.defaults import (
......@@ -72,6 +73,11 @@ class SGLangConfigModifier:
return cfg.model_dump()
@classmethod
def update_image(cls, config: dict, image: str) -> dict:
    """Update the container image for all DGD services (frontend, planner, workers).

    Backend entry point that delegates to the shared ``update_image`` helper
    imported from ``benchmarks.profiler.utils.config``.

    Args:
        config: DGD configuration dictionary.
        image: Container image to set for the services.

    Returns:
        The configuration dictionary returned by the shared helper.
    """
    # The bare name resolves to the module-level import, not to this method.
    return update_image(config, image)
@classmethod
def convert_config(
cls,
......
......@@ -18,6 +18,7 @@ from benchmarks.profiler.utils.config import (
remove_valued_arguments,
set_argument_value,
setup_worker_service_resources,
update_image,
validate_and_get_worker_args,
)
from benchmarks.profiler.utils.defaults import (
......@@ -74,6 +75,11 @@ class TrtllmConfigModifier:
return cfg.model_dump()
@classmethod
def update_image(cls, config: dict, image: str) -> dict:
    """Update the container image for all DGD services (frontend, planner, workers).

    Backend entry point that delegates to the shared ``update_image`` helper
    imported from ``benchmarks.profiler.utils.config``.

    Args:
        config: DGD configuration dictionary.
        image: Container image to set for the services.

    Returns:
        The configuration dictionary returned by the shared helper.
    """
    # The bare name resolves to the module-level import, not to this method.
    return update_image(config, image)
@classmethod
def convert_config(
cls,
......
......@@ -14,6 +14,7 @@ from benchmarks.profiler.utils.config import (
get_worker_service_from_config,
set_argument_value,
setup_worker_service_resources,
update_image,
validate_and_get_worker_args,
)
from benchmarks.profiler.utils.defaults import (
......@@ -69,6 +70,11 @@ class VllmV1ConfigModifier:
return cfg.model_dump()
@classmethod
def update_image(cls, config: dict, image: str) -> dict:
    """Update the container image for all DGD services (frontend, planner, workers).

    Backend entry point that delegates to the shared ``update_image`` helper
    imported from ``benchmarks.profiler.utils.config``.

    Args:
        config: DGD configuration dictionary.
        image: Container image to set for the services.

    Returns:
        The configuration dictionary returned by the shared helper.
    """
    # The bare name resolves to the module-level import, not to this method.
    return update_image(config, image)
@classmethod
def convert_config(
cls,
......
......@@ -121,6 +121,12 @@ def create_profiler_parser() -> argparse.Namespace:
default=config.get("deployment", {}).get("model", ""),
help="Model to serve, can be HF model name or local model path",
)
parser.add_argument(
"--dgd-image",
type=str,
default=config.get("deployment", {}).get("dgd_image", ""),
help="Container image to use for DGD components (frontend, planner, workers). Overrides images in config file.",
)
# CLI arguments with config-aware defaults (using nested .get() for cleaner code)
parser.add_argument(
......@@ -295,10 +301,9 @@ def create_profiler_parser() -> argparse.Namespace:
delattr(args, "profile_config")
# Validate required arguments
if not args.config:
parser.error("--config is required (either via CLI or profile-config)")
# Either --model or --config (or both) must be provided
if not args.model and not args.config:
parser.error("--model or --config is required")
parser.error("--model or --config is required (provide at least one)")
auto_generate_search_space(args)
......
......@@ -53,7 +53,7 @@ Advanced disaggregated deployment with SLA-based automatic scaling.
- `TRTLLMPrefillWorker`: Specialized prefill-only worker
> [!NOTE]
> This deployment requires pre-deployment profiling to be completed first. See [Pre-Deployment Profiling](../../../../docs/benchmarks/pre_deployment_profiling.md) for detailed instructions.
> This deployment requires pre-deployment profiling to be completed first. See [Pre-Deployment Profiling](../../../../docs/benchmarks/sla_driven_profiling.md) for detailed instructions.
## CRD Structure
......
......@@ -99,7 +99,7 @@ We have public images available on [NGC Catalog](https://catalog.ngc.nvidia.com/
### Pre-Deployment Profiling (SLA Planner Only)
If using the SLA Planner deployment (`disagg_planner.yaml`), follow the [pre-deployment profiling guide](../../../../docs/benchmarks/pre_deployment_profiling.md) to run pre-deployment profiling. The results will be saved to the `dynamo-pvc` PVC and queried by the SLA Planner.
If using the SLA Planner deployment (`disagg_planner.yaml`), follow the [pre-deployment profiling guide](../../../../docs/benchmarks/sla_driven_profiling.md) to run pre-deployment profiling. The results will be saved to the `dynamo-pvc` PVC and queried by the SLA Planner.
## Usage
......
......@@ -28,7 +28,7 @@ logger = logging.getLogger(__name__)
MISSING_PROFILING_DATA_ERROR_MESSAGE = (
"SLA-Planner requires pre-deployment profiling results to run.\n"
"Please follow /docs/benchmarks/pre_deployment_profiling.md to run the profiling first,\n"
"Please follow /docs/benchmarks/sla_driven_profiling.md to run the profiling first,\n"
"and make sure the profiling results are present in --profile-results-dir."
)
......
......@@ -33,7 +33,7 @@ spec:
scope: Namespaced
versions:
- additionalPrinterColumns:
- jsonPath: .spec.modelName
- jsonPath: .spec.model
name: Model
type: string
- jsonPath: .status.backend
......@@ -94,6 +94,15 @@ spec:
after profiling completes. If false, only the spec is generated and stored in status.
Users can then manually create a DGD using the generated spec.
type: boolean
backend:
description: |-
Backend specifies the inference backend to use.
The controller automatically sets this value in profilingConfig.config.engine.backend.
enum:
- vllm
- sglang
- trtllm
type: string
deploymentOverrides:
description: |-
DeploymentOverrides allows customizing metadata for the auto-created DGD.
......@@ -121,18 +130,27 @@ spec:
Namespace is the desired namespace for the created DynamoGraphDeployment.
If not specified, defaults to the DGDR namespace.
type: string
workersImage:
description: |-
WorkersImage specifies the container image to use for DynamoGraphDeployment worker components.
This image is used for both temporary DGDs created during online profiling and the final DGD.
If omitted, the image from the base config file (e.g., disagg.yaml) is used.
Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
type: string
type: object
modelName:
model:
description: |-
ModelName specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").
Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").
This is a high-level identifier for easy reference in kubectl output and logs.
The controller automatically sets this value in profilingConfig.config.deployment.model.
type: string
profilingConfig:
description: |-
ProfilingConfig provides the complete configuration for the profiling job.
This configuration is passed directly to the profiler.
The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).
The profiler will validate the configuration and report any errors.
Note: deployment.model and engine.backend are automatically set from the high-level
model and backend fields and should not be specified in this config.
properties:
config:
description: |-
......@@ -156,9 +174,18 @@ spec:
required:
- name
type: object
profilerImage:
description: |-
ProfilerImage specifies the container image to use for profiling jobs.
This image contains the profiler code and dependencies needed for SLA-based profiling.
Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
type: string
required:
- profilerImage
type: object
required:
- modelName
- backend
- model
- profilingConfig
type: object
status:
......
......@@ -124,10 +124,9 @@ spec:
- --mpi-run-ssh-secret-name={{ .Values.dynamo.mpiRun.secretName }}
- --mpi-run-ssh-secret-namespace={{ .Release.Namespace }}
{{- end }}
{{- if .Values.dynamo.dgdr.profilerImage }}
- --profiler-image={{ .Values.dynamo.dgdr.profilerImage }}
{{- end }}
{{- if not .Values.namespaceRestriction.enabled }}
{{- if .Values.namespaceRestriction.enabled }}
- --dgdr-profiling-cluster-role-name={{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-dgdr-profiling-nodes
{{- else }}
- --dgdr-profiling-cluster-role-name={{ include "dynamo-operator.fullname" . }}-dgdr-profiling
- --planner-cluster-role-name={{ include "dynamo-operator.fullname" . }}-planner
{{- end }}
......
......@@ -73,12 +73,11 @@ subjects:
- kind: ServiceAccount
name: dgdr-profiling-job
namespace: {{ .Release.Namespace }}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: {{ include "dynamo-operator.fullname" . }}-dgdr-profiling-nodes
name: {{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-dgdr-profiling-nodes
labels:
{{- include "dynamo-operator.labels" . | nindent 4 }}
app.kubernetes.io/component: dgdr-profiling
......@@ -87,7 +86,22 @@ rules:
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: {{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-dgdr-profiling-nodes
labels:
{{- include "dynamo-operator.labels" . | nindent 4 }}
app.kubernetes.io/component: dgdr-profiling
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: {{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-dgdr-profiling-nodes
subjects:
- kind: ServiceAccount
name: dgdr-profiling-job
namespace: {{ .Release.Namespace }}
{{- else }}
# Cluster-wide mode: ClusterRole for DGDR profiling jobs
---
......@@ -122,21 +136,20 @@ rules:
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "list", "watch"]
{{- end }}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: {{ include "dynamo-operator.fullname" . }}-dgdr-profiling-nodes
name: {{ include "dynamo-operator.fullname" . }}-dgdr-profiling
labels:
{{- include "dynamo-operator.labels" . | nindent 4 }}
app.kubernetes.io/component: dgdr-profiling
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: {{ include "dynamo-operator.fullname" . }}-dgdr-profiling-nodes
name: {{ include "dynamo-operator.fullname" . }}-dgdr-profiling
subjects:
- kind: ServiceAccount
name: dgdr-profiling-job
namespace: {{ .Release.Namespace }}
{{- end }}
......@@ -117,15 +117,6 @@ dynamo:
sshKeygen:
enabled: true
# DynamoGraphDeploymentRequest (DGDR) configuration
dgdr:
# Container image to use for profiling jobs (both online and offline/AIC)
# REQUIRED: Must be set to create DynamoGraphDeploymentRequests
# For development: Build and push the profiler image from the ai-dynamo repository
# Public image will be available in release 0.6.1
# Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
profilerImage: ""
#imagePullSecrets: []
kubernetesClusterDomain: cluster.local
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment