Unverified Commit ec5630ea authored by hhzhang16's avatar hhzhang16 Committed by GitHub
Browse files

feat: mount model path to Profiler if specified (#5212)


Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
parent c29f78c1
......@@ -17,10 +17,10 @@ spec:
# Sweep/profiling configuration
sweep:
# AI Configurator mode (fast simulation-based profiling)
use_ai_configurator: true
aic_system: h200_sxm
aic_hf_id: Qwen/Qwen3-32B
aic_backend_version: "0.20.0"
useAiConfigurator: true
aicSystem: h200_sxm
aicHfId: Qwen/Qwen3-32B
aicBackendVersion: "0.20.0"
# SLA targets for profiling
sla:
......
......@@ -17,7 +17,7 @@ spec:
# Sweep/profiling configuration
sweep:
# Online profiling mode (real deployment testing)
use_ai_configurator: false
useAiConfigurator: false
# SLA targets for profiling
sla:
......
......@@ -15,16 +15,22 @@ spec:
profilingConfig:
profilerImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1"
config:
# Model cache PVC configuration; profiler will mount this PVC to access model weights
deployment:
modelCache:
pvcName: "model-cache" # Name of PVC containing model weights
pvcPath: "deepseek-r1" # Subpath within PVC where model is stored
# Sweep/profiling configuration
sweep:
# Standard online profiling (not using AI Configurator)
use_ai_configurator: false
useAiConfigurator: false
hardware:
# for h200, sweep over 8-16 GPUs per engine
min_num_gpus_per_engine: 8
max_num_gpus_per_engine: 16
num_gpus_per_node: 8
minNumGpusPerEngine: 8
maxNumGpusPerEngine: 16
numGpusPerNode: 8
# SLA targets for profiling
sla:
......
......@@ -12,6 +12,22 @@ from benchmarks.profiler.utils.planner_utils import add_planner_arguments_to_par
from benchmarks.profiler.utils.search_space_autogen import auto_generate_search_space
def _get(cfg: Dict[str, Any], camel: str, snake: str, default: Any = None) -> Any:
"""Get config value with camelCase preferred, snake_case fallback."""
if camel in cfg:
return cfg[camel]
return cfg.get(snake, default)
def _camel_to_snake(name: str) -> str:
"""Convert camelCase to snake_case."""
import re
# Insert underscore before uppercase letters and lowercase
s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()
def parse_config_string(config_str: str) -> Dict[str, Any]:
"""Parse configuration string as Python dict literal, YAML, or JSON.
......@@ -61,45 +77,48 @@ def create_profiler_parser() -> argparse.Namespace:
"""
Create argument parser with support for YAML config string.
Config structure:
output_dir: String (path to the output results directory, default: profiling_results)
Config structure (camelCase preferred, snake_case supported for backwards compat):
outputDir: String (path to the output results directory, default: profiling_results)
deployment:
namespace: String (kubernetes namespace, default: dynamo-sla-profiler)
service_name: String (service name, default: "")
serviceName: String (service name, default: "")
model: String (served model name)
model_cache_pvc_name: String (name of the PVC to mount the model cache,
dgdImage: String (container image to use for DGD components (frontend, planner, workers), overrides images in config file)
modelCache:
pvcName: String (name of the PVC to mount the model cache,
if not provided, model must be HF name and will download from HF, default: "")
model_cache_pvc_path: String (path to the model cache in the PVC, default: "")
model_cache_pvc_mount_path: String (path to the model cache in the container,
pvcPath: String (path to the model cache in the PVC, default: "")
mountPath: String (path to the model cache in the container,
note that the PVC must be mounted to the same path for the profiling job,
default: "/opt/model-cache")
engine:
backend: String (backend type, currently support [vllm, sglang, trtllm], default: vllm)
config: String (path to the DynamoGraphDeployment config file, default: "")
max_context_length: Int (maximum context length supported by the served model, default: 0)
is_moe_model: Boolean (enable MoE (Mixture of Experts) model support, use TEP for prefill and DEP for decode, default: False)
maxContextLength: Int (maximum context length supported by the served model, default: 0)
isMoeModel: Boolean (enable MoE (Mixture of Experts) model support, use TEP for prefill and DEP for decode, default: False)
hardware:
min_num_gpus_per_engine: Int (minimum number of GPUs per engine, default: 0)
max_num_gpus_per_engine: Int (maximum number of GPUs per engine, default: 0)
num_gpus_per_node: Int (number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size, default: 0)
minNumGpusPerEngine: Int (minimum number of GPUs per engine, default: 0)
maxNumGpusPerEngine: Int (maximum number of GPUs per engine, default: 0)
numGpusPerNode: Int (number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size, default: 0)
enableGpuDiscovery: Boolean (enable automatic GPU discovery from Kubernetes cluster nodes, when enabled overrides any manually specified hardware configuration, requires cluster-wide node access permissions, default: False)
sweep:
prefill_interpolation_granularity: Int (how many samples to benchmark to interpolate TTFT under different ISL, default: 16)
decode_interpolation_granularity: Int (how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length, default: 6)
use_ai_configurator: Boolean (use ai-configurator to estimate benchmarking results instead of running actual deployment, default: False)
aic_system: String (target system for use with aiconfigurator, default: None)
aic_hf_id: String (aiconfigurator huggingface id of the target model, default: None)
aic_backend: String (aiconfigurator backend of the target model, if not provided, will use args.backend, default: "")
aic_backend_version: String (specify backend version when using aiconfigurator to estimate perf, default: None)
dry_run: Boolean (dry run the profile job, default: False)
pick_with_webui: Boolean (pick the best parallelization mapping using webUI, default: False)
webui_port: Int (webUI port, default: $PROFILER_WEBUI_PORT or 8000)
prefillInterpolationGranularity: Int (how many samples to benchmark to interpolate TTFT under different ISL, default: 16)
decodeInterpolationGranularity: Int (how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length, default: 6)
useAiConfigurator: Boolean (use ai-configurator to estimate benchmarking results instead of running actual deployment, default: False)
aicSystem: String (target system for use with aiconfigurator, default: None)
aicHfId: String (aiconfigurator huggingface id of the target model, default: None)
aicBackend: String (aiconfigurator backend of the target model, if not provided, will use args.backend, default: "")
aicBackendVersion: String (specify backend version when using aiconfigurator to estimate perf, default: None)
dryRun: Boolean (dry run the profile job, default: False)
pickWithWebui: Boolean (pick the best parallelization mapping using webUI, default: False)
webuiPort: Int (webUI port, default: $PROFILER_WEBUI_PORT or 8000)
sla:
isl: Int (target input sequence length, default: 3000)
osl: Int (target output sequence length, default: 500)
ttft: Float (target Time To First Token in milliseconds, default: 50)
itl: Float (target Inter Token Latency in milliseconds, default: 10)
planner: (planner-bypass arguments, use hyphens or underscores)
i.e., planner-min-endpoint: 2 # or planner_min_endpoint: 2 (both work)
planner: (planner arguments)
e.g., plannerMinEndpoint: 2
"""
# Step 1: Pre-parse to check if --profile-config is provided
pre_parser = argparse.ArgumentParser(add_help=False)
......@@ -130,37 +149,37 @@ def create_profiler_parser() -> argparse.Namespace:
default=config.get("deployment", {}).get("model", ""),
help="Served model name",
)
model_cache_config = config.get("deployment", {}).get("modelCache", {})
parser.add_argument(
"--model-cache-pvc-name",
type=str,
default=config.get("deployment", {}).get("model_cache_pvc_name", ""),
default=model_cache_config.get("pvcName", ""),
help="Name of the PVC that contains the model weights. If not provided, args.model must be a HF model name and will download from HF",
)
parser.add_argument(
"--model-cache-pvc-path",
type=str,
default=config.get("deployment", {}).get("model_cache_pvc_path", ""),
default=model_cache_config.get("pvcPath", ""),
help="Path to the model cache in the PVC",
)
parser.add_argument(
"--model-cache-pvc-mount-path",
type=str,
default=config.get("deployment", {}).get(
"model_cache_pvc_mount_path", "/opt/model-cache"
),
default=model_cache_config.get("mountPath", "/opt/model-cache"),
help="Path to the model cache in the container, note that the PVC must be mounted to the same path for the profiling job",
)
deployment_cfg = config.get("deployment", {})
parser.add_argument(
"--dgd-image",
type=str,
default=config.get("deployment", {}).get("dgd_image", ""),
default=_get(deployment_cfg, "dgdImage", "dgd_image", ""),
help="Container image to use for DGD components (frontend, planner, workers). Overrides images in config file.",
)
parser.add_argument(
"--namespace",
type=str,
default=config.get("deployment", {}).get("namespace", "dynamo-sla-profiler"),
default=deployment_cfg.get("namespace", "dynamo-sla-profiler"),
help="Kubernetes namespace to deploy the DynamoGraphDeployment",
)
parser.add_argument(
......@@ -180,25 +199,26 @@ def create_profiler_parser() -> argparse.Namespace:
parser.add_argument(
"--output-dir",
type=str,
default=config.get("output_dir", "profiling_results"),
default=_get(config, "outputDir", "output_dir", "profiling_results"),
help="Path to the output results directory",
)
hardware_cfg = config.get("hardware", {})
parser.add_argument(
"--min-num-gpus-per-engine",
type=int,
default=config.get("hardware", {}).get("min_num_gpus_per_engine", 0),
default=_get(hardware_cfg, "minNumGpusPerEngine", "min_num_gpus_per_engine", 0),
help="minimum number of GPUs per engine",
)
parser.add_argument(
"--max-num-gpus-per-engine",
type=int,
default=config.get("hardware", {}).get("max_num_gpus_per_engine", 0),
default=_get(hardware_cfg, "maxNumGpusPerEngine", "max_num_gpus_per_engine", 0),
help="maximum number of GPUs per engine",
)
parser.add_argument(
"--num-gpus-per-node",
type=int,
default=config.get("hardware", {}).get("num_gpus_per_node", 0),
default=_get(hardware_cfg, "numGpusPerNode", "num_gpus_per_node", 0),
help="Number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size",
)
parser.add_argument(
......@@ -227,46 +247,58 @@ def create_profiler_parser() -> argparse.Namespace:
)
# arguments used for interpolating TTFT and ITL under different ISL/OSL
engine_cfg = config.get("engine", {})
parser.add_argument(
"--max-context-length",
type=int,
default=config.get("engine", {}).get("max_context_length", 0),
default=_get(engine_cfg, "maxContextLength", "max_context_length", 0),
help="maximum context length supported by the served model",
)
sweep_cfg = config.get("sweep", {})
parser.add_argument(
"--prefill-interpolation-granularity",
type=int,
default=config.get("sweep", {}).get("prefill_interpolation_granularity", 16),
default=_get(
sweep_cfg,
"prefillInterpolationGranularity",
"prefill_interpolation_granularity",
16,
),
help="how many samples to benchmark to interpolate TTFT under different ISL",
)
parser.add_argument(
"--decode-interpolation-granularity",
type=int,
default=config.get("sweep", {}).get("decode_interpolation_granularity", 6),
default=_get(
sweep_cfg,
"decodeInterpolationGranularity",
"decode_interpolation_granularity",
6,
),
help="how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length",
)
parser.add_argument(
"--service-name",
type=str,
default=config.get("deployment", {}).get("service_name", ""),
default=_get(deployment_cfg, "serviceName", "service_name", ""),
help="Service name for port forwarding (default: {deployment_name}-frontend)",
)
parser.add_argument(
"--dry-run",
action="store_true",
default=config.get("sweep", {}).get("dry_run", False),
default=_get(sweep_cfg, "dryRun", "dry_run", False),
help="Dry run the profile job",
)
parser.add_argument(
"--enable-gpu-discovery",
action="store_true",
default=config.get("hardware", {}).get("enable_gpu_discovery", False),
default=_get(hardware_cfg, "enableGpuDiscovery", "enable_gpu_discovery", False),
help="Enable automatic GPU discovery from Kubernetes cluster nodes. When enabled, overrides any manually specified hardware configuration. Requires cluster-wide node access permissions.",
)
parser.add_argument(
"--pick-with-webui",
action="store_true",
default=config.get("sweep", {}).get("pick_with_webui", False),
default=_get(sweep_cfg, "pickWithWebui", "pick_with_webui", False),
help="Pick the best parallelization mapping using webUI",
)
......@@ -277,19 +309,19 @@ def create_profiler_parser() -> argparse.Namespace:
parser.add_argument(
"--webui-port",
type=int,
default=config.get("sweep", {}).get("webui_port", default_webui_port),
default=_get(sweep_cfg, "webuiPort", "webui_port", default_webui_port),
help="WebUI port",
)
# Dynamically add all planner arguments from planner_argparse.py
add_planner_arguments_to_parser(parser, prefix="planner-")
# Set defaults for any planner arguments found in config.planner
# Note: argparse converts hyphens to underscores, so we need to normalize keys
# Normalize keys: camelCase -> snake_case, hyphens -> underscores
planner_config = config.get("planner", {})
if planner_config:
# Convert hyphens to underscores to match argparse's internal naming
normalized_planner_config = {
key.replace("-", "_"): value for key, value in planner_config.items()
_camel_to_snake(key).replace("-", "_"): value
for key, value in planner_config.items()
}
parser.set_defaults(**normalized_planner_config)
......@@ -297,31 +329,31 @@ def create_profiler_parser() -> argparse.Namespace:
parser.add_argument(
"--use-ai-configurator",
action="store_true",
default=config.get("sweep", {}).get("use_ai_configurator", False),
default=_get(sweep_cfg, "useAiConfigurator", "use_ai_configurator", False),
help="Use ai-configurator to estimate benchmarking results instead of running actual deployment.",
)
parser.add_argument(
"--aic-system",
type=str,
default=config.get("sweep", {}).get("aic_system"),
default=_get(sweep_cfg, "aicSystem", "aic_system", None),
help="Target system for use with aiconfigurator (e.g. h100_sxm, h200_sxm)",
)
parser.add_argument(
"--aic-hf-id",
type=str,
default=config.get("sweep", {}).get("aic_hf_id"),
default=_get(sweep_cfg, "aicHfId", "aic_hf_id", None),
help="aiconfigurator name of the target model (e.g. Qwen/Qwen3-32B, meta-llama/Llama-3.1-405B)",
)
parser.add_argument(
"--aic-backend",
type=str,
default=config.get("sweep", {}).get("aic_backend", ""),
default=_get(sweep_cfg, "aicBackend", "aic_backend", ""),
help="aiconfigurator backend of the target model, if not provided, will use args.backend",
)
parser.add_argument(
"--aic-backend-version",
type=str,
default=config.get("sweep", {}).get("aic_backend_version"),
default=_get(sweep_cfg, "aicBackendVersion", "aic_backend_version", None),
help="Specify backend version when using aiconfigurator to estimate perf.",
)
......
......@@ -144,8 +144,8 @@ spec:
description: |-
EnableGpuDiscovery controls whether the profiler should automatically discover GPU
resources from the Kubernetes cluster nodes. When enabled, the profiler will override
any manually specified hardware configuration (min_num_gpus_per_engine, max_num_gpus_per_engine,
num_gpus_per_node) with values detected from the cluster.
any manually specified hardware configuration (minNumGpusPerEngine, maxNumGpusPerEngine,
numGpusPerNode) with values detected from the cluster.
Requires cluster-wide node access permissions - only available with cluster-scoped operators.
type: boolean
model:
......
......@@ -146,8 +146,8 @@ type DynamoGraphDeploymentRequestSpec struct {
// EnableGpuDiscovery controls whether the profiler should automatically discover GPU
// resources from the Kubernetes cluster nodes. When enabled, the profiler will override
// any manually specified hardware configuration (min_num_gpus_per_engine, max_num_gpus_per_engine,
// num_gpus_per_node) with values detected from the cluster.
// any manually specified hardware configuration (minNumGpusPerEngine, maxNumGpusPerEngine,
// numGpusPerNode) with values detected from the cluster.
// Requires cluster-wide node access permissions - only available with cluster-scoped operators.
// +kubebuilder:default=false
// +kubebuilder:validation:Optional
......
......@@ -144,8 +144,8 @@ spec:
description: |-
EnableGpuDiscovery controls whether the profiler should automatically discover GPU
resources from the Kubernetes cluster nodes. When enabled, the profiler will override
any manually specified hardware configuration (min_num_gpus_per_engine, max_num_gpus_per_engine,
num_gpus_per_node) with values detected from the cluster.
any manually specified hardware configuration (minNumGpusPerEngine, maxNumGpusPerEngine,
numGpusPerNode) with values detected from the cluster.
Requires cluster-wide node access permissions - only available with cluster-scoped operators.
type: boolean
model:
......
......@@ -37,24 +37,24 @@ spec:
# Engine configuration
engine:
max_context_length: 16384 # will override max context length of the model if provided
maxContextLength: 16384 # will override max context length of the model if provided
# Hardware configuration
hardware:
min_num_gpus_per_engine: 1 # Minimum GPUs to test
max_num_gpus_per_engine: 4 # Maximum GPUs to test (limited by model's num_heads/4)
num_gpus_per_node: 8 # GPUs per node (for MoE models)
minNumGpusPerEngine: 1 # Minimum GPUs to test
maxNumGpusPerEngine: 4 # Maximum GPUs to test (limited by model's num_heads/4)
numGpusPerNode: 8 # GPUs per node (for MoE models)
# Sweep/profiling configuration
sweep:
prefill_interpolation_granularity: 16 # Samples for TTFT interpolation
decode_interpolation_granularity: 6 # Samples for ITL interpolation
prefillInterpolationGranularity: 16 # Samples for TTFT interpolation
decodeInterpolationGranularity: 6 # Samples for ITL interpolation
# AI Configurator mode (fast simulation-based profiling, 20-30 seconds)
use_ai_configurator: false # Set to false for online profiling (2-4 hours)
aic_system: h200_sxm # Target GPU system for AI Configurator
aic_hf_id: Qwen/Qwen3-0.6B # HuggingFace model ID for AI Configurator
aic_backend_version: "0.20.0" # Backend version for AI Configurator
useAiConfigurator: false # Set to false for online profiling (2-4 hours)
aicSystem: h200_sxm # Target GPU system for AI Configurator
aicHfId: Qwen/Qwen3-0.6B # HuggingFace model ID for AI Configurator
aicBackendVersion: "0.20.0" # Backend version for AI Configurator
# SLA targets for profiling
sla:
......@@ -65,8 +65,8 @@ spec:
# Optional: Planner-specific arguments
# planner:
# planner_min_endpoint: 2
# # Add any other planner args here (use hyphens or underscores)
# plannerMinEndpoint: 2
# # Add any other planner args here
# Reference to ConfigMap containing the DGD base config (disagg.yaml)
# The path to this file will be automatically set as engine.config
......
......@@ -116,6 +116,7 @@ const (
// Volume names
VolumeNameProfilingConfig = "profiling-config"
VolumeNameProfilingOutput = "profiling-output"
VolumeNameModelCache = "model-cache"
// Volume paths
ProfilingOutputPath = "/data"
......@@ -123,6 +124,7 @@ const (
ProfilingOutputFileMocker = "mocker_config_with_planner.yaml"
ProfilingConfigPath = "/config"
ProfilingConfigFile = "disagg.yaml"
DefaultModelCacheMountPath = "/opt/model-cache"
// Command line arguments
ArgModel = "--model"
......@@ -152,6 +154,7 @@ const (
MessageProfilingCheckFailed = "ProfilingCheckFailed"
MessageConfigMapNotFound = "ConfigMap %s not found in namespace %s"
MessageConfigMapKeyNotFound = "key %s not found in ConfigMap %s"
MessageModelCachePVCNotFound = "model cache PVC %s not found in namespace %s"
// Validation messages
ValidationErrorModelRequired = "model is required"
......@@ -163,6 +166,13 @@ const (
BackendVLLM = "vllm"
BackendSGLang = "sglang"
BackendTRTLLM = "trtllm"
// Profiling config field names
ConfigKeyDeployment = "deployment"
ConfigKeyModelCache = "modelCache"
ConfigKeyPVCName = "pvcName"
ConfigKeyPVCPath = "pvcPath"
ConfigKeyMountPath = "mountPath"
)
// shell script template for the output copier sidecar
......@@ -796,6 +806,10 @@ func isOnlineProfiling(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) boo
}
if sweep, ok := config["sweep"].(map[string]interface{}); ok {
// Check camelCase first (preferred), then snake_case (backwards compat)
if useAIC, exists := sweep["useAiConfigurator"].(bool); exists {
return !useAIC
}
if useAIC, exists := sweep["use_ai_configurator"].(bool); exists {
return !useAIC
}
......@@ -852,6 +866,23 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Contex
}
}
// Validate model cache PVC if provided
modelCachePVC, _ := extractModelCachePVCConfig(dgdr)
if modelCachePVC != "" {
pvc := &corev1.PersistentVolumeClaim{}
err := r.Get(ctx, types.NamespacedName{
Name: modelCachePVC,
Namespace: dgdr.Namespace,
}, pvc)
if err != nil {
if apierrors.IsNotFound(err) {
return fmt.Errorf(MessageModelCachePVCNotFound, modelCachePVC, dgdr.Namespace)
}
return err
}
}
// The profiler will validate the rest of the configuration
return nil
}
......@@ -959,6 +990,17 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
})
}
// Add model cache PVC mount if configured in profilingConfig.config.deployment
modelCachePVC, modelCacheMountPath := extractModelCachePVCConfig(dgdr)
if modelCachePVC != "" {
logger.Info("Mounting model cache PVC to profiler pod", "pvc", modelCachePVC, "mountPath", modelCacheMountPath)
volumeMounts = append(volumeMounts, corev1.VolumeMount{
Name: VolumeNameModelCache,
MountPath: modelCacheMountPath,
ReadOnly: true,
})
}
// Profiler args: pass the config as an inline YAML string via --profile-config
profilerArgs := []string{
"--profile-config", string(configYAML),
......@@ -1064,6 +1106,19 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
})
}
// Add model cache PVC volume if configured
if modelCachePVC != "" {
volumes = append(volumes, corev1.Volume{
Name: VolumeNameModelCache,
VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
ClaimName: modelCachePVC,
ReadOnly: true,
},
},
})
}
// Limit retries to prevent infinite loop
backoffLimit := int32(3)
......@@ -1193,6 +1248,41 @@ func (r *DynamoGraphDeploymentRequestReconciler) prepareProfilingConfig(dgdr *nv
return configYAML, nil
}
// extractModelCachePVCConfig reads the optional model cache PVC settings found
// under deployment.modelCache in the DGDR's profiling config.
//
// It returns (pvcName, mountPath). Both are empty when the profiling config is
// absent, cannot be unmarshalled, lacks the deployment/modelCache sections, or
// has no non-empty pvcName. When a PVC name is present but mountPath is missing
// or empty, DefaultModelCacheMountPath is returned as the mount path.
func extractModelCachePVCConfig(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (string, string) {
	rawConfig := dgdr.Spec.ProfilingConfig.Config
	if rawConfig == nil {
		return "", ""
	}

	var parsed map[string]interface{}
	if unmarshalErr := yaml.Unmarshal(rawConfig.Raw, &parsed); unmarshalErr != nil {
		// Parse failures are reported as "not configured" to the caller.
		return "", ""
	}

	deploymentSection, found := parsed[ConfigKeyDeployment].(map[string]interface{})
	if !found {
		return "", ""
	}

	cacheSection, found := deploymentSection[ConfigKeyModelCache].(map[string]interface{})
	if !found {
		return "", ""
	}

	// A failed type assertion leaves claimName as "", which is handled the
	// same way as an explicitly empty name.
	claimName, _ := cacheSection[ConfigKeyPVCName].(string)
	if claimName == "" {
		return "", ""
	}

	if configuredPath, _ := cacheSection[ConfigKeyMountPath].(string); configuredPath != "" {
		return claimName, configuredPath
	}
	return claimName, DefaultModelCacheMountPath
}
// checkProfilingJobStatus checks if the profiling job has completed
func (r *DynamoGraphDeploymentRequestReconciler) checkProfilingJobStatus(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (bool, error) {
logger := log.FromContext(ctx)
......
......@@ -858,6 +858,36 @@ var _ = Describe("DGDR Helper Functions", func() {
}
Expect(isOnlineProfiling(dgdr)).Should(BeTrue())
})
It("Should return false for AI Configurator profiling (useAiConfigurator=true camelCase)", func() {
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{
"sweep": map[string]interface{}{
"useAiConfigurator": true,
},
}),
},
},
}
Expect(isOnlineProfiling(dgdr)).Should(BeFalse())
})
It("Should return true for online profiling (useAiConfigurator=false camelCase)", func() {
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{
"sweep": map[string]interface{}{
"useAiConfigurator": false,
},
}),
},
},
}
Expect(isOnlineProfiling(dgdr)).Should(BeTrue())
})
})
})
......
......@@ -63,13 +63,13 @@ profilingConfig:
config:
# Override hardware defaults if needed
hardware:
min_num_gpus_per_engine: 1
max_num_gpus_per_engine: 8
num_gpus_per_node: 8
minNumGpusPerEngine: 1
maxNumGpusPerEngine: 8
numGpusPerNode: 8
# Only needed when using AI Configurator (sweep.use_ai_configurator: true)
# Only needed when using AI Configurator (sweep.useAiConfigurator: true)
sweep:
aic_system: h200_sxm # GPU type for AI Configurator (h100_sxm, h200_sxm, etc.)
aicSystem: h200_sxm # GPU type for AI Configurator (h100_sxm, h200_sxm, etc.)
```
### Automatic GPU Discovery (Optional Feature)
......@@ -120,7 +120,7 @@ Profiles your model by creating real test deployments in Kubernetes and measurin
profilingConfig:
config:
sweep:
use_ai_configurator: false # Default
useAiConfigurator: false # Default
```
### AI Configurator Simulation
......@@ -138,11 +138,10 @@ Uses performance simulation to rapidly estimate optimal configurations without r
profilingConfig:
config:
sweep:
use_ai_configurator: true
aic:
system: h200_sxm # GPU system type
model_name: QWEN3_32B # AIC model identifier
backend_version: "0.20.0"
useAiConfigurator: true
aicSystem: h200_sxm # GPU system type
aicHfId: Qwen/Qwen3-32B # HuggingFace model ID
aicBackendVersion: "0.20.0"
```
**Supported Configurations:**
......@@ -290,8 +289,7 @@ spec:
config: # Profiler configuration
sla: { ... }
hardware: { ... }
sweep: { ... }
aic: { ... }
sweep: { ... } # AIC settings go here (aicSystem, aicHfId, etc.)
planner: { ... }
deploymentOverrides: # Optional
......@@ -326,16 +324,16 @@ Control GPU search space and constraints:
profilingConfig:
config:
hardware:
min_num_gpus_per_engine: 2 # if not provided, will automatically determine based on model and VRAM size
max_num_gpus_per_engine: 8 # Maximum GPUs to test
num_gpus_per_node: 8 # GPUs per node (for multi-node MoE)
gpu_type: h200_sxm # GPU type hint
minNumGpusPerEngine: 2 # if not provided, will automatically determine based on model and VRAM size
maxNumGpusPerEngine: 8 # Maximum GPUs to test
numGpusPerNode: 8 # GPUs per node (for multi-node MoE)
gpuType: h200_sxm # GPU type hint
```
**When to use:**
- **min_num_gpus_per_engine**: Skip small TP sizes if your model is large
- **max_num_gpus_per_engine**: Limit search space or work around constraints (e.g., [AIC attention heads](#ai-configurator-attention-head-constraint-error))
- **num_gpus_per_node**: Determine the upper bound of number of GPUs per node for dense models and configure Grove for multi-node MoE engines.
- **minNumGpusPerEngine**: Skip small TP sizes if your model is large
- **maxNumGpusPerEngine**: Limit search space or work around constraints (e.g., [AIC attention heads](#ai-configurator-attention-head-constraint-error))
- **numGpusPerNode**: Determine the upper bound of number of GPUs per node for dense models and configure Grove for multi-node MoE engines.
- **gpuType**: Informational, auto-detected by controller
> [!TIP]
......@@ -349,17 +347,17 @@ Control profiling behavior:
profilingConfig:
config:
sweep:
use_ai_configurator: false # Use offline profiling (default: false)
prefill_interpolation_granularity: 16 # Samples for prefill TTFT curve
decode_interpolation_granularity: 6 # Samples for decode ITL curve
useAiConfigurator: false # Use offline profiling (default: false)
prefillInterpolationGranularity: 16 # Samples for prefill TTFT curve
decodeInterpolationGranularity: 6 # Samples for decode ITL curve
```
**Use cases:**
- **use_ai_configurator**: Set to `true` for 20-30 second profiling (TensorRT-LLM only)
- **prefill_interpolation_granularity**: How many samples to benchmark for prefill TTFT curve (lower = faster but may be less accurate)
- **decode_interpolation_granularity**: How many samples to benchmark for decode ITL curve (lower = faster but may be less accurate). Since ITL interpolation is a 3d plot and takes longer to run, we default to a smaller number of samples. Increasing this value might quadratically increase the profiling time.
- **useAiConfigurator**: Set to `true` for 20-30 second profiling (TensorRT-LLM only)
- **prefillInterpolationGranularity**: How many samples to benchmark for prefill TTFT curve (lower = faster but may be less accurate)
- **decodeInterpolationGranularity**: How many samples to benchmark for decode ITL curve (lower = faster but may be less accurate). Since ITL interpolation is a 3d plot and takes longer to run, we default to a smaller number of samples. Increasing this value might quadratically increase the profiling time.
### AI Configurator Configuration (Required if `use_ai_configurator: true`)
### AI Configurator Configuration (Required if `useAiConfigurator: true`)
Configure AI Configurator profiling mode:
......@@ -367,10 +365,10 @@ Configure AI Configurator profiling mode:
profilingConfig:
config:
sweep:
use_ai_configurator: true
aic_system: h200_sxm # GPU system: h100_sxm, h200_sxm, b200_sxm, gb200_sxm, a100_sxm
aic_hf_id: Qwen/Qwen3-32B # Huggingface model id
aic_backend_version: "0.20.0" # TensorRT-LLM version: 0.20.0, 1.0.0rc3
useAiConfigurator: true
aicSystem: h200_sxm # GPU system: h100_sxm, h200_sxm, b200_sxm, gb200_sxm, a100_sxm
aicHfId: Qwen/Qwen3-32B # Huggingface model id
aicBackendVersion: "0.20.0" # TensorRT-LLM version: 0.20.0, 1.0.0rc3
```
**Supported configurations:** See [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#supported-features)
......@@ -391,6 +389,27 @@ profilingConfig:
> [!NOTE]
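> Planner arguments use the `planner` prefix in camelCase (e.g., `plannerMinEndpoint`); the legacy snake_case `planner_` form is still accepted. See the planner documentation for the full list.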
### Model Cache PVC (Advanced)
For large models, you can use a pre-populated PVC containing model weights instead of downloading from HuggingFace. This is useful when:
- The model is not publicly available on HuggingFace
- You want to avoid repeated downloads during profiling
- You have a shared model cache across your cluster
```yaml
profilingConfig:
config:
deployment:
modelCache:
pvcName: "model-cache" # Name of PVC containing model weights (required)
pvcPath: "hub/models--deepseek-ai--DeepSeek-R1" # Subpath within PVC (optional)
mountPath: "/opt/model-cache" # Mount path in container (optional, default: /opt/model-cache)
```
**Requirements:**
- The PVC must exist in the same namespace as the DGDR
- The model weights must be accessible at `{mountPath}/{pvcPath}`
### Engine Configuration (Auto-configured)
The controller automatically sets these from high-level fields:
......@@ -434,11 +453,11 @@ spec:
itl: 20.0
hardware:
min_num_gpus_per_engine: 1
max_num_gpus_per_engine: 8
minNumGpusPerEngine: 1
maxNumGpusPerEngine: 8
sweep:
use_ai_configurator: false
useAiConfigurator: false
deploymentOverrides:
workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
......@@ -467,12 +486,10 @@ spec:
itl: 10.0
sweep:
use_ai_configurator: true
aic:
system: h200_sxm
model_name: QWEN3_32B
backend_version: "0.20.0"
useAiConfigurator: true
aicSystem: h200_sxm
aicHfId: Qwen/Qwen3-32B
aicBackendVersion: "0.20.0"
deploymentOverrides:
workersImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.6.1"
......@@ -501,11 +518,11 @@ spec:
itl: 25.0
hardware:
num_gpus_per_node: 8
max_num_gpus_per_engine: 32
numGpusPerNode: 8
maxNumGpusPerEngine: 32
engine:
is_moe_model: true # Enable MoE profiling mode
isMoeModel: true # Enable MoE profiling mode
deploymentOverrides:
workersImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1"
......@@ -520,15 +537,15 @@ spec:
**Solution 1**: Use AI Configurator for rapid profiling (TensorRT-LLM only):
```yaml
sweep:
use_ai_configurator: true
useAiConfigurator: true
```
**Solution 2**: Reduce search space:
```yaml
config:
sweep:
min_num_gpus: 4 # Skip TP1, TP2
max_num_gpus: 8 # Don't test beyond TP8
minNumGpus: 4 # Skip TP1, TP2
maxNumGpus: 8 # Don't test beyond TP8
```
### SLA Cannot Be Met
......@@ -555,19 +572,18 @@ AssertionError: num_heads <N> should be divisible by tp_size <M> and the divisio
- **GPT-2** (12 heads): Max TP = 3
- Most models **<1B parameters**: May hit this constraint
**Solution**: Limit `max_num_gpus_per_engine` in your DGDR:
**Solution**: Limit `maxNumGpusPerEngine` in your DGDR:
```yaml
profilingConfig:
profilerImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.6.1"
config:
hardware:
max_num_gpus_per_engine: 4 # For Qwen3-0.6B (16 heads / 4 = max TP of 4)
maxNumGpusPerEngine: 4 # For Qwen3-0.6B (16 heads / 4 = max TP of 4)
sweep:
use_ai_configurator: true
aic:
system: h200_sxm
model_name: QWEN3_0_6B
useAiConfigurator: true
aicSystem: h200_sxm
aicHfId: Qwen/Qwen3-0.6B
```
**Calculate Max TP**: `max_tp = num_attention_heads / 4`
......
......@@ -311,7 +311,7 @@ _Appears in:_
| `model` _string_ | Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").<br />This is a high-level identifier for easy reference in kubectl output and logs.<br />The controller automatically sets this value in profilingConfig.config.deployment.model. | | Required: \{\} <br /> |
| `backend` _string_ | Backend specifies the inference backend for profiling.<br />The controller automatically sets this value in profilingConfig.config.engine.backend.<br />Profiling runs on real GPUs or via AIC simulation to collect performance data. | | Enum: [vllm sglang trtllm] <br />Required: \{\} <br /> |
| `useMocker` _boolean_ | UseMocker indicates whether to deploy a mocker DynamoGraphDeployment instead of<br />a real backend deployment. When true, the deployment uses simulated engines that<br />don't require GPUs, using the profiling data to simulate realistic timing behavior.<br />Mocker is available in all backend images and useful for large-scale experiments.<br />Profiling still runs against the real backend (specified above) to collect performance data. | false | |
| `enableGpuDiscovery` _boolean_ | EnableGpuDiscovery controls whether the profiler should automatically discover GPU<br />resources from the Kubernetes cluster nodes. When enabled, the profiler will override<br />any manually specified hardware configuration (min_num_gpus_per_engine, max_num_gpus_per_engine,<br />num_gpus_per_node) with values detected from the cluster.<br />Requires cluster-wide node access permissions - only available with cluster-scoped operators. | false | Optional: \{\} <br /> |
| `enableGpuDiscovery` _boolean_ | EnableGpuDiscovery controls whether the profiler should automatically discover GPU<br />resources from the Kubernetes cluster nodes. When enabled, the profiler will override<br />any manually specified hardware configuration (minNumGpusPerEngine, maxNumGpusPerEngine,<br />numGpusPerNode) with values detected from the cluster.<br />Requires cluster-wide node access permissions - only available with cluster-scoped operators. | false | Optional: \{\} <br /> |
| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.<br />This configuration is passed directly to the profiler.<br />The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).<br />Note: deployment.model and engine.backend are automatically set from the high-level<br />model and backend fields and should not be specified in this config. | | Required: \{\} <br /> |
| `autoApply` _boolean_ | AutoApply indicates whether to automatically create a DynamoGraphDeployment<br />after profiling completes. If false, only the spec is generated and stored in status.<br />Users can then manually create a DGD using the generated spec. | false | |
| `deploymentOverrides` _[DeploymentOverridesSpec](#deploymentoverridesspec)_ | DeploymentOverrides allows customizing metadata for the auto-created DGD.<br />Only applicable when AutoApply is true. | | Optional: \{\} <br /> |
......
......@@ -92,38 +92,10 @@ Dynamo provides sample DGDR configurations in `benchmarks/profiler/deploy/`. You
**Available Sample DGDRs:**
- **`profile_sla_dgdr.yaml`**: Standard online profiling for dense models
- **`profile_sla_aic_dgdr.yaml`**: Fast offline profiling using AI Configurator (TensorRT-LLM)
- **`profile_sla_aic_dgdr.yaml`**: Fast offline profiling using AI Configurator
- **`profile_sla_moe_dgdr.yaml`**: Online profiling for MoE models (SGLang)
Or, you can create your own DGDR for your own needs:
```yaml
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeploymentRequest
metadata:
name: my-model-deployment # Change the name
namespace: default # Change the namespace
spec:
model: "Qwen/Qwen3-0.6B" # Update to your model
backend: vllm # Backend: vllm, sglang, or trtllm
profilingConfig:
profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" # Required
config:
sla:
isl: 3000 # Adjust to your workload
osl: 150 # Adjust to your workload
ttft: 200 # Your target (ms)
itl: 20 # Your target (ms)
sweep:
use_ai_configurator: false # Set to true for fast profiling (TensorRT-LLM only)
deploymentOverrides:
workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" # Optional
autoApply: true # Auto-deploy after profiling
```
Or, you can create your own DGDR for your own needs.
> [!TIP]
> For detailed explanations of all configuration options (SLA, hardware, sweep, AIC, planner), see the [DGDR Configuration Reference](/docs/benchmarks/sla_driven_profiling.md#dgdr-configuration-reference).
......@@ -242,14 +214,14 @@ Choose between **online profiling** (real measurements, 2-4 hours) or **offline
```yaml
# Online Profiling (Default)
sweep:
use_ai_configurator: false
useAiConfigurator: false
# Offline Profiling (AI Configurator - TensorRT-LLM only)
# Offline Profiling (AI Configurator)
sweep:
use_ai_configurator: true
aic_system: h200_sxm
aic_hf_id: Qwen/Qwen3-32B
aic_backend_version: "0.20.0"
useAiConfigurator: true
aicSystem: h200_sxm
aicHfId: Qwen/Qwen3-32B
aicBackendVersion: "0.20.0"
```
> [!NOTE]
......@@ -297,11 +269,10 @@ spec:
ttft: 300
itl: 10
sweep:
use_ai_configurator: true
aic:
system: h200_sxm
model_name: DEEPSEEK_V3
backend_version: "0.20.0"
useAiConfigurator: true
aicSystem: h200_sxm
aicHfId: deepseek-ai/DeepSeek-V3
aicBackendVersion: "0.20.0"
deploymentOverrides:
workersImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1"
......@@ -327,26 +298,26 @@ profilingConfig:
# Hardware constraints (optional)
hardware:
min_num_gpus_per_engine: 2
max_num_gpus_per_engine: 8
gpu_type: h200_sxm
minNumGpusPerEngine: 2
maxNumGpusPerEngine: 8
gpuType: h200_sxm
# Profiling sweep settings (optional)
sweep:
prefill_interpolation_granularity: 16 # Number of samples for prefill ISL sweep
decode_interpolation_granularity: 6 # Number of samples for decode sweep
prefillInterpolationGranularity: 16 # Number of samples for prefill ISL sweep
decodeInterpolationGranularity: 6 # Number of samples for decode sweep
```
> **Note**: `engine.config` is a **file path** to a DGD YAML file, not inline configuration. Use ConfigMapRef (recommended) or leave it unset to auto-generate.
#### Planner Configuration Passthrough
Add planner-specific settings. Planner arguments use a `planner_` prefix:
Add planner-specific settings:
```yaml
profilingConfig:
config:
planner:
planner_min_endpoint: 2
plannerMinEndpoint: 2
```
## Understanding Profiling Results
......@@ -378,6 +349,10 @@ spec:
Profiling still runs against the real backend (via GPUs or AIC) to collect performance data. The mocker deployment then uses this data to simulate realistic timing behavior.
### Using a Model Cache PVC
For large models, you can use a pre-populated PVC containing model weights instead of downloading from Hugging Face. See [Model Cache PVC](/docs/benchmarks/sla_driven_profiling.md#model-cache-pvc-advanced) for configuration details.
### DGDR Immutability
DGDRs are **immutable** - if you need to update SLAs or configuration:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment