Unverified Commit 6fc4c595 authored by hhzhang16's avatar hhzhang16 Committed by GitHub
Browse files

feat: remove cluster wide logic from namespace restricted operator (#3934)


Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
parent 22d910a5
......@@ -158,13 +158,13 @@ def create_profiler_parser() -> argparse.Namespace:
parser.add_argument(
"--min-num-gpus-per-engine",
type=int,
default=config.get("hardware", {}).get("min_num_gpus_per_engine", 0),
default=config.get("hardware", {}).get("min_num_gpus_per_engine", 1),
help="minimum number of GPUs per engine",
)
parser.add_argument(
"--max-num-gpus-per-engine",
type=int,
default=config.get("hardware", {}).get("max_num_gpus_per_engine", 0),
default=config.get("hardware", {}).get("max_num_gpus_per_engine", 8),
help="maximum number of GPUs per engine",
)
parser.add_argument(
......@@ -245,9 +245,15 @@ def create_profiler_parser() -> argparse.Namespace:
parser.add_argument(
"--num-gpus-per-node",
type=int,
default=config.get("hardware", {}).get("num_gpus_per_node", 0),
default=config.get("hardware", {}).get("num_gpus_per_node", 8),
help="Number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size",
)
parser.add_argument(
"--enable-gpu-discovery",
action="store_true",
default=config.get("hardware", {}).get("enable_gpu_discovery", False),
help="Enable automatic GPU discovery from Kubernetes cluster nodes. When enabled, overrides any manually specified hardware configuration. Requires cluster-wide node access permissions.",
)
# Dynamically add all planner arguments from planner_argparse.py
add_planner_arguments_to_parser(parser, prefix="planner-")
......@@ -305,6 +311,9 @@ def create_profiler_parser() -> argparse.Namespace:
if not args.model and not args.config:
parser.error("--model or --config is required (provide at least one)")
auto_generate_search_space(args)
# Run auto-generation if GPU discovery is enabled
# This will override any manually specified hardware parameters
if args.enable_gpu_discovery:
auto_generate_search_space(args)
return args
......@@ -138,6 +138,15 @@ spec:
Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
type: string
type: object
enableGpuDiscovery:
default: false
description: |-
EnableGpuDiscovery controls whether the profiler should automatically discover GPU
resources from the Kubernetes cluster nodes. When enabled, the profiler will override
any manually specified hardware configuration (min_num_gpus_per_engine, max_num_gpus_per_engine,
num_gpus_per_node) with values detected from the cluster.
Requires cluster-wide node access permissions - only available with cluster-scoped operators.
type: boolean
model:
description: |-
Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").
......
......@@ -37,7 +37,7 @@ Prevents all conflict scenarios:
{{- end -}}
{{- if $namespaceRestrictedOperators -}}
{{- fail (printf "VALIDATION ERROR: Cannot install cluster-wide Dynamo operator. Found existing namespace-restricted Dynamo operators in namespaces: %s. This would create resource conflicts as both the cluster-wide operator and namespace-restricted operators would manage the same DGDs/DCDs. Either:\n1. Use one of the existing namespace-restricted operators for your specific namespace, or\n2. Uninstall all existing namespace-restricted operators first, or\n3. Install this operator in namespace-restricted mode: --set namespaceRestriction.enabled=true" (join ", " ($namespaceRestrictedOperators | uniq))) -}}
{{- fail (printf "VALIDATION ERROR: Cannot install cluster-wide Dynamo operator. Found existing namespace-restricted Dynamo operators in namespaces: %s. This would create resource conflicts as both the cluster-wide operator and namespace-restricted operators would manage the same DGDs/DCDs. Either:\n1. Use one of the existing namespace-restricted operators for your specific namespace, or\n2. Uninstall all existing namespace-restricted operators first, or\n3. Install this operator in namespace-restricted mode: --set dynamo-operator.namespaceRestriction.enabled=true" (join ", " ($namespaceRestrictedOperators | uniq))) -}}
{{- end -}}
{{- end -}}
......
......@@ -124,9 +124,7 @@ spec:
- --mpi-run-ssh-secret-name={{ .Values.dynamo.mpiRun.secretName }}
- --mpi-run-ssh-secret-namespace={{ .Release.Namespace }}
{{- end }}
{{- if .Values.namespaceRestriction.enabled }}
- --dgdr-profiling-cluster-role-name={{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-dgdr-profiling-nodes
{{- else }}
{{- if not .Values.namespaceRestriction.enabled }}
- --dgdr-profiling-cluster-role-name={{ include "dynamo-operator.fullname" . }}-dgdr-profiling
- --planner-cluster-role-name={{ include "dynamo-operator.fullname" . }}-planner
{{- end }}
......
......@@ -70,35 +70,6 @@ roleRef:
kind: Role
name: dgdr-profiling-job
subjects:
- kind: ServiceAccount
name: dgdr-profiling-job
namespace: {{ .Release.Namespace }}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: {{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-dgdr-profiling-nodes
labels:
{{- include "dynamo-operator.labels" . | nindent 4 }}
app.kubernetes.io/component: dgdr-profiling
rules:
# Nodes - cluster-scoped resource needed for profiling
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: {{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-dgdr-profiling-nodes
labels:
{{- include "dynamo-operator.labels" . | nindent 4 }}
app.kubernetes.io/component: dgdr-profiling
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: {{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-dgdr-profiling-nodes
subjects:
- kind: ServiceAccount
name: dgdr-profiling-job
namespace: {{ .Release.Namespace }}
......
......@@ -114,6 +114,15 @@ type DynamoGraphDeploymentRequestSpec struct {
// +kubebuilder:validation:Enum=vllm;sglang;trtllm
Backend string `json:"backend"`
// EnableGpuDiscovery controls whether the profiler should automatically discover GPU
// resources from the Kubernetes cluster nodes. When enabled, the profiler will override
// any manually specified hardware configuration (min_num_gpus_per_engine, max_num_gpus_per_engine,
// num_gpus_per_node) with values detected from the cluster.
// Requires cluster-wide node access permissions - only available with cluster-scoped operators.
// +kubebuilder:default=false
// +kubebuilder:validation:Optional
EnableGpuDiscovery bool `json:"enableGpuDiscovery,omitempty"`
// ProfilingConfig provides the complete configuration for the profiling job.
// This configuration is passed directly to the profiler.
// The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).
......
......@@ -138,6 +138,15 @@ spec:
Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
type: string
type: object
enableGpuDiscovery:
default: false
description: |-
EnableGpuDiscovery controls whether the profiler should automatically discover GPU
resources from the Kubernetes cluster nodes. When enabled, the profiler will override
any manually specified hardware configuration (min_num_gpus_per_engine, max_num_gpus_per_engine,
num_gpus_per_node) with values detected from the cluster.
Requires cluster-wide node access permissions - only available with cluster-scoped operators.
type: boolean
model:
description: |-
Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").
......
......@@ -720,6 +720,11 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Contex
return errors.New("profilingConfig.config is required and must not be empty")
}
// Validate enableGpuDiscovery is only true for cluster-wide operators
if dgdr.Spec.EnableGpuDiscovery && r.Config.RestrictedNamespace != "" {
return errors.New("enableGpuDiscovery can only be set to true for cluster-wide operators. Namespace-restricted operators cannot access cluster nodes for GPU discovery. Please set enableGpuDiscovery to false and provide hardware configuration (hardware.min_num_gpus_per_engine, hardware.max_num_gpus_per_engine, hardware.num_gpus_per_node) in profilingConfig.config")
}
// Validate ConfigMap if provided (for the DGD base config)
if dgdr.Spec.ProfilingConfig.ConfigMapRef != nil {
cm := &corev1.ConfigMap{}
......@@ -937,6 +942,12 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
"--profile-config", string(configYAML),
}
// Add --enable-gpu-discovery flag based on DGDR spec
// GPU discovery requires cluster-wide node access
if dgdr.Spec.EnableGpuDiscovery {
profilerArgs = append(profilerArgs, "--enable-gpu-discovery")
}
// Use profiler image from profilingConfig
imageName := dgdr.Spec.ProfilingConfig.ProfilerImage
logger.Info("Using profiler image", "image", imageName)
......
......@@ -42,16 +42,45 @@ The recommended way to profile models is through DGDRs. Sample configurations ar
- **`profile_sla_moe_dgdr.yaml`**: MoE model profiling
The Dynamo Operator automatically:
1. Discovers GPU resources
1. Discovers GPU resources (cluster-scoped operators only)
2. Runs profiling (AIPerf on real engines or AI Configurator simulation)
3. Generates optimal DGD configuration with SLA planner
4. Deploys the DGD to your cluster
See the [Quick Start Guide](/docs/planner/sla_planner_quickstart.md) for prerequisites and detailed instructions.
## Hardware Configuration
Hardware parameters have sensible defaults and are **optional** - you can override them if needed:
```yaml
profilingConfig:
config:
# Override hardware defaults if needed
hardware:
min_num_gpus_per_engine: 1
max_num_gpus_per_engine: 8
num_gpus_per_node: 8
# Only needed when using AI Configurator (sweep.use_ai_configurator: true)
sweep:
aic_system: h200_sxm # GPU type for AI Configurator (h100_sxm, h200_sxm, etc.)
```
### Automatic GPU Discovery (Optional Feature)
Cluster-scoped operators can optionally enable automatic GPU discovery to detect hardware from cluster nodes. When enabled, hardware config is auto-detected and overrides any manually specified values.
```yaml
spec:
enableGpuDiscovery: true
```
This feature is only available with cluster-scoped operators (`namespaceRestriction.enabled=false`) as it requires cluster-wide node access permissions. It is not available for namespace-restricted operators.
## Profiling Method
1. **GPU Discovery**: Detects available GPUs and their specifications
1. **Hardware Setup**: Uses defaults or user-specified hardware configuration. Optionally, cluster-scoped operators can enable automatic GPU discovery to detect specifications from cluster nodes.
2. **Identify Sweep Ranges**: Automatically determine the minimum and maximum number of GPUs per engine. The minimum is determined by the model size and GPU VRAM. The maximum is set to one node for dense models and 4 nodes for MoE models.
3. **Parallelization Mapping Sweep**: Using the input ISL and OSL, test the performance of the engines with different parallelization mappings. For dense models, we test different TP sizes for both prefill and decode. For MoE models, we test different TEP sizes for prefill and DEP sizes for decode.
- **Prefill**: Since prefill has no in-flight batching (assuming the ISL is long enough to saturate the GPU), we directly measure the TTFT for a request with the given ISL and no KV-cache reuse. For example, the plot below shows the prefill parallelization mapping sweep results on H100 for deepseek-ai/DeepSeek-R1-Distill-Llama-8B.
......
......@@ -77,7 +77,7 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `name` _string_ | Name of the ConfigMap containing the desired data. | | Required: \{\} <br /> |
| `name` _string_ | Name of the ConfigMap containing the desired data. | | Required: {} <br /> |
| `key` _string_ | Key in the ConfigMap to select. If not specified, defaults to "disagg.yaml". | disagg.yaml | |
......@@ -95,11 +95,11 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `name` _string_ | Name is the desired name for the created DynamoGraphDeployment.<br />If not specified, defaults to the DGDR name. | | Optional: \{\} <br /> |
| `namespace` _string_ | Namespace is the desired namespace for the created DynamoGraphDeployment.<br />If not specified, defaults to the DGDR namespace. | | Optional: \{\} <br /> |
| `labels` _object (keys:string, values:string)_ | Labels are additional labels to add to the DynamoGraphDeployment metadata.<br />These are merged with auto-generated labels from the profiling process. | | Optional: \{\} <br /> |
| `annotations` _object (keys:string, values:string)_ | Annotations are additional annotations to add to the DynamoGraphDeployment metadata. | | Optional: \{\} <br /> |
| `workersImage` _string_ | WorkersImage specifies the container image to use for DynamoGraphDeployment worker components.<br />This image is used for both temporary DGDs created during online profiling and the final DGD.<br />If omitted, the image from the base config file (e.g., disagg.yaml) is used.<br />Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" | | Optional: \{\} <br /> |
| `name` _string_ | Name is the desired name for the created DynamoGraphDeployment.<br />If not specified, defaults to the DGDR name. | | Optional: {} <br /> |
| `namespace` _string_ | Namespace is the desired namespace for the created DynamoGraphDeployment.<br />If not specified, defaults to the DGDR namespace. | | Optional: {} <br /> |
| `labels` _object (keys:string, values:string)_ | Labels are additional labels to add to the DynamoGraphDeployment metadata.<br />These are merged with auto-generated labels from the profiling process. | | Optional: {} <br /> |
| `annotations` _object (keys:string, values:string)_ | Annotations are additional annotations to add to the DynamoGraphDeployment metadata. | | Optional: {} <br /> |
| `workersImage` _string_ | WorkersImage specifies the container image to use for DynamoGraphDeployment worker components.<br />This image is used for both temporary DGDs created during online profiling and the final DGD.<br />If omitted, the image from the base config file (e.g., disagg.yaml) is used.<br />Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" | | Optional: {} <br /> |
#### DeploymentStatus
......@@ -237,6 +237,7 @@ DynamoGraphDeploymentRequest is the Schema for the dynamographdeploymentrequests
It serves as the primary interface for users to request model deployments with
specific performance and resource constraints, enabling SLA-driven deployments.
Lifecycle:
1. Initial → Pending: Validates spec and prepares for profiling
2. Pending → Profiling: Creates and runs profiling job (online or AIC)
......@@ -245,6 +246,7 @@ Lifecycle:
5. Ready: Terminal state when DGD is operational or spec is available
6. DeploymentDeleted: Terminal state when auto-created DGD is manually deleted
The spec becomes immutable once profiling starts. Users must delete and recreate
the DGDR to modify configuration after this point.
......@@ -276,11 +278,12 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `model` _string_ | Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").<br />This is a high-level identifier for easy reference in kubectl output and logs.<br />The controller automatically sets this value in profilingConfig.config.deployment.model. | | Required: \{\} <br /> |
| `backend` _string_ | Backend specifies the inference backend to use.<br />The controller automatically sets this value in profilingConfig.config.engine.backend. | | Enum: [vllm sglang trtllm] <br />Required: \{\} <br /> |
| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.<br />This configuration is passed directly to the profiler.<br />The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).<br />Note: deployment.model and engine.backend are automatically set from the high-level<br />modelName and backend fields and should not be specified in this config. | | Required: \{\} <br /> |
| `model` _string_ | Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").<br />This is a high-level identifier for easy reference in kubectl output and logs.<br />The controller automatically sets this value in profilingConfig.config.deployment.model. | | Required: {} <br /> |
| `backend` _string_ | Backend specifies the inference backend to use.<br />The controller automatically sets this value in profilingConfig.config.engine.backend. | | Enum: [vllm sglang trtllm] <br />Required: {} <br /> |
| `enableGpuDiscovery` _boolean_ | EnableGpuDiscovery controls whether the profiler should automatically discover GPU<br />resources from the Kubernetes cluster nodes. When enabled, the profiler will override<br />any manually specified hardware configuration (min_num_gpus_per_engine, max_num_gpus_per_engine,<br />num_gpus_per_node) with values detected from the cluster.<br />Requires cluster-wide node access permissions - only available with cluster-scoped operators. | false | Optional: {} <br /> |
| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.<br />This configuration is passed directly to the profiler.<br />The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).<br />Note: deployment.model and engine.backend are automatically set from the high-level<br />model and backend fields and should not be specified in this config. | | Required: {} <br /> |
| `autoApply` _boolean_ | AutoApply indicates whether to automatically create a DynamoGraphDeployment<br />after profiling completes. If false, only the spec is generated and stored in status.<br />Users can then manually create a DGD using the generated spec. | false | |
| `deploymentOverrides` _[DeploymentOverridesSpec](#deploymentoverridesspec)_ | DeploymentOverrides allows customizing metadata for the auto-created DGD.<br />Only applicable when AutoApply is true. | | Optional: \{\} <br /> |
| `deploymentOverrides` _[DeploymentOverridesSpec](#deploymentoverridesspec)_ | DeploymentOverrides allows customizing metadata for the auto-created DGD.<br />Only applicable when AutoApply is true. | | Optional: {} <br /> |
#### DynamoGraphDeploymentRequestStatus
......@@ -298,12 +301,12 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `state` _string_ | State is a high-level textual status of the deployment request lifecycle.<br />Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed"<br />Empty string ("") represents the initial state before initialization. | | |
| `backend` _string_ | Backend is extracted from profilingConfig.config.engine.backend for display purposes.<br />This field is populated by the controller and shown in kubectl output. | | Optional: \{\} <br /> |
| `backend` _string_ | Backend is extracted from profilingConfig.config.engine.backend for display purposes.<br />This field is populated by the controller and shown in kubectl output. | | Optional: {} <br /> |
| `observedGeneration` _integer_ | ObservedGeneration reflects the generation of the most recently observed spec.<br />Used to detect spec changes and enforce immutability after profiling starts. | | |
| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the deployment request.<br />Standard condition types include: Validation, Profiling, SpecGenerated, DeploymentReady.<br />Conditions are merged by type on patch updates. | | |
| `profilingResults` _string_ | ProfilingResults contains a reference to the ConfigMap holding profiling data.<br />Format: "configmap/<name>" | | Optional: \{\} <br /> |
| `generatedDeployment` _[RawExtension](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#rawextension-runtime-pkg)_ | GeneratedDeployment contains the full generated DynamoGraphDeployment specification<br />including metadata, based on profiling results. Users can extract this to create<br />a DGD manually, or it's used automatically when autoApply is true.<br />Stored as RawExtension to preserve all fields including metadata. | | EmbeddedResource: \{\} <br />Optional: \{\} <br /> |
| `deployment` _[DeploymentStatus](#deploymentstatus)_ | Deployment tracks the auto-created DGD when AutoApply is true.<br />Contains name, namespace, state, and creation status of the managed DGD. | | Optional: \{\} <br /> |
| `profilingResults` _string_ | ProfilingResults contains a reference to the ConfigMap holding profiling data.<br />Format: "configmap/<name>" | | Optional: {} <br /> |
| `generatedDeployment` _[RawExtension](#rawextension)_ | GeneratedDeployment contains the full generated DynamoGraphDeployment specification<br />including metadata, based on profiling results. Users can extract this to create<br />a DGD manually, or it's used automatically when autoApply is true.<br />Stored as RawExtension to preserve all fields including metadata. | | EmbeddedResource: {} <br />Optional: {} <br /> |
| `deployment` _[DeploymentStatus](#deploymentstatus)_ | Deployment tracks the auto-created DGD when AutoApply is true.<br />Contains name, namespace, state, and creation status of the managed DGD. | | Optional: {} <br /> |
#### DynamoGraphDeploymentSpec
......@@ -319,9 +322,9 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `pvcs` _[PVC](#pvc) array_ | PVCs defines a list of persistent volume claims that can be referenced by components.<br />Each PVC must have a unique name that can be referenced in component specifications. | | Optional: \{\} <br /> |
| `services` _object (keys:string, values:[DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec))_ | Services are the services to deploy as part of this deployment. | | Optional: \{\} <br /> |
| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs are environment variables applied to all services in the deployment unless<br />overridden by service-specific configuration. | | Optional: \{\} <br /> |
| `pvcs` _[PVC](#pvc) array_ | PVCs defines a list of persistent volume claims that can be referenced by components.<br />Each PVC must have a unique name that can be referenced in component specifications. | | Optional: {} <br /> |
| `services` _object (keys:string, values:[DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec))_ | Services are the services to deploy as part of this deployment. | | Optional: {} <br /> |
| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs are environment variables applied to all services in the deployment unless<br />overridden by service-specific configuration. | | Optional: {} <br /> |
| `backendFramework` _string_ | BackendFramework specifies the backend framework (e.g., "sglang", "vllm", "trtllm"). | | Enum: [sglang vllm trtllm] <br /> |
......@@ -415,9 +418,9 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `create` _boolean_ | Create indicates to create a new PVC | | |
| `name` _string_ | Name is the name of the PVC | | Required: \{\} <br /> |
| `name` _string_ | Name is the name of the PVC | | Required: {} <br /> |
| `storageClass` _string_ | StorageClass to be used for PVC creation. Required when create is true. | | |
| `size` _[Quantity](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#quantity-resource-api)_ | Size of the volume in Gi, used during PVC creation. Required when create is true. | | |
| `size` _[Quantity](#quantity)_ | Size of the volume in Gi, used during PVC creation. Required when create is true. | | |
| `volumeAccessMode` _[PersistentVolumeAccessMode](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#persistentvolumeaccessmode-v1-core)_ | VolumeAccessMode is the volume access mode of the PVC. Required when create is true. | | |
......@@ -436,9 +439,9 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `config` _[JSON](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#json-v1-apiextensions-k8s-io)_ | Config is the profiling configuration as arbitrary JSON/YAML. This will be passed directly to the profiler.<br />The profiler will validate the configuration and report any errors. | | Optional: \{\} <br />Type: object <br /> |
| `configMapRef` _[ConfigMapKeySelector](#configmapkeyselector)_ | ConfigMapRef is an optional reference to a ConfigMap containing the DynamoGraphDeployment<br />base config file (disagg.yaml). This is separate from the profiling config above.<br />The path to this config will be set as engine.config in the profiling config. | | Optional: \{\} <br /> |
| `profilerImage` _string_ | ProfilerImage specifies the container image to use for profiling jobs.<br />This image contains the profiler code and dependencies needed for SLA-based profiling.<br />Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" | | Required: \{\} <br /> |
| `config` _[JSON](#json)_ | Config is the profiling configuration as arbitrary JSON/YAML. This will be passed directly to the profiler.<br />The profiler will validate the configuration and report any errors. | | Optional: {} <br />Type: object <br /> |
| `configMapRef` _[ConfigMapKeySelector](#configmapkeyselector)_ | ConfigMapRef is an optional reference to a ConfigMap containing the DynamoGraphDeployment<br />base config file (disagg.yaml). This is separate from the profiling config above.<br />The path to this config will be set as engine.config in the profiling config. | | Optional: {} <br /> |
| `profilerImage` _string_ | ProfilerImage specifies the container image to use for profiling jobs.<br />This image contains the profiler code and dependencies needed for SLA-based profiling.<br />Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" | | Required: {} <br /> |
#### SharedMemorySpec
......@@ -456,7 +459,7 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `disabled` _boolean_ | | | |
| `size` _[Quantity](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#quantity-resource-api)_ | | | |
| `size` _[Quantity](#quantity)_ | | | |
#### VolumeMount
......@@ -473,7 +476,7 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `name` _string_ | Name references a PVC name defined in the top-level PVCs map | | Required: \{\} <br /> |
| `name` _string_ | Name references a PVC name defined in the top-level PVCs map | | Required: {} <br /> |
| `mountPoint` _string_ | MountPoint specifies where to mount the volume.<br />If useAsCompilationCache is true and mountPoint is not specified,<br />a backend-specific default will be used. | | |
| `useAsCompilationCache` _boolean_ | UseAsCompilationCache indicates this volume should be used as a compilation cache.<br />When true, backend-specific environment variables will be set and default mount points may be used. | false | |
......
......@@ -229,26 +229,17 @@ sweep:
# Offline Profiling (AI Configurator - TensorRT-LLM only)
sweep:
use_ai_configurator: true
aic:
system: h200_sxm
model_name: QWEN3_32B
backend_version: "0.20.0"
aic_system: h200_sxm
aic_model_name: QWEN3_32B
aic_backend_version: "0.20.0"
```
> [!NOTE]
> For detailed comparison, supported configurations, and limitations, see [SLA-Driven Profiling Documentation](/docs/benchmarks/sla_driven_profiling.md#profiling-methods).
### GPU Discovery
### Hardware Configuration
By default, the DGDR controller automatically discovers available GPU resources. Optionally specify preferences:
```yaml
spec:
gpu:
type: h200 # GPU type (e.g., h100, h200)
count: 8 # Number of GPUs to use
memoryGB: 141 # GPU memory in GB
```
For details on hardware configuration and GPU discovery options, see [Hardware Configuration in SLA-Driven Profiling](/docs/benchmarks/sla_driven_profiling.md#hardware-configuration).
### Advanced Configuration
......
......@@ -242,8 +242,12 @@ class TestProfileSLADryRun:
self.namespace = "test-namespace"
self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" # Specify model for autogen
self.dgd_image = ""
self.min_num_gpus_per_engine = 0 # Will be auto-generated
self.max_num_gpus_per_engine = 0 # Will be auto-generated
self.min_num_gpus_per_engine = (
1 # Will be overridden by auto-generation
)
self.max_num_gpus_per_engine = (
8 # Will be overridden by auto-generation
)
self.skip_existing_results = False
self.force_rerun = False
self.isl = 3000
......@@ -261,7 +265,7 @@ class TestProfileSLADryRun:
self.aic_model_name = None
self.aic_backend = ""
self.aic_backend_version = None
self.num_gpus_per_node = None # Will be auto-generated
self.num_gpus_per_node = 8 # Will be overridden by auto-generation
self.deploy_after_profile = False
return Args()
......@@ -304,8 +308,12 @@ class TestProfileSLADryRun:
self.namespace = "test-namespace"
self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" # Specify model for autogen
self.dgd_image = ""
self.min_num_gpus_per_engine = 0 # Will be auto-generated
self.max_num_gpus_per_engine = 0 # Will be auto-generated
self.min_num_gpus_per_engine = (
1 # Will be overridden by auto-generation
)
self.max_num_gpus_per_engine = (
8 # Will be overridden by auto-generation
)
self.skip_existing_results = False
self.force_rerun = False
self.isl = 3000
......@@ -323,7 +331,7 @@ class TestProfileSLADryRun:
self.aic_model_name = None
self.aic_backend = ""
self.aic_backend_version = None
self.num_gpus_per_node = None # Will be auto-generated
self.num_gpus_per_node = 8 # Will be overridden by auto-generation
self.deploy_after_profile = False
return Args()
......@@ -366,8 +374,12 @@ class TestProfileSLADryRun:
self.namespace = "test-namespace"
self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" # Specify model for autogen
self.dgd_image = ""
self.min_num_gpus_per_engine = 0 # Will be auto-generated
self.max_num_gpus_per_engine = 0 # Will be auto-generated
self.min_num_gpus_per_engine = (
1 # Will be overridden by auto-generation
)
self.max_num_gpus_per_engine = (
8 # Will be overridden by auto-generation
)
self.skip_existing_results = False
self.force_rerun = False
self.isl = 3000
......@@ -385,7 +397,7 @@ class TestProfileSLADryRun:
self.aic_model_name = None
self.aic_backend = ""
self.aic_backend_version = None
self.num_gpus_per_node = None # Will be auto-generated
self.num_gpus_per_node = 8 # Will be overridden by auto-generation
self.deploy_after_profile = False
return Args()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment