help="Maximum GPU budget (-1 for no budget enforcement)",
)
# Integer option; when omitted on the command line it takes the value of
# SLAPlannerDefaults.min_endpoint.
parser.add_argument(
    "--min-endpoint",
    help="Minimum number of endpoints",
    default=SLAPlannerDefaults.min_endpoint,
    type=int,
)
# The decode and prefill GPU-count options differ only in the engine name, so
# both are registered from one template. Order (decode, then prefill) is kept
# so the generated --help output lists them as before. Both default to None.
for _engine_kind in ("decode", "prefill"):
    parser.add_argument(
        f"--{_engine_kind}-engine-num-gpu",
        help=f"Number of GPUs per {_engine_kind} engine. In Kubernetes mode, this is auto-detected "
        "from DGD resources but can be overridden (e.g., for mockers without GPU resources).",
        default=None,
        type=int,
    )
# No `type` is given, so the value is kept as the raw command-line string.
# The help text below documents the two accepted forms: a directory path, or a
# 'use-pre-swept-results:...' selector string.
parser.add_argument(
    "--profile-results-dir",
    help=(
        "Profile results directory or "
        "'use-pre-swept-results:<gpu_type>:<framework>:<model>:<tp>:<dp>:<pp>:<block_size>:<max_batch_size>:<gpu_count>' "
        "to use pre-swept results from pre_swept_results directory"
    ),
    default=SLAPlannerDefaults.profile_results_dir,
)
# Latency options: both are floats (milliseconds, per their help text) whose
# defaults come from SLAPlannerDefaults. Registered in the original order.
for _latency_flag, _latency_default, _latency_help in (
    (
        "--ttft",
        SLAPlannerDefaults.ttft,
        "Time to first token (float, in milliseconds)",
    ),
    (
        "--itl",
        SLAPlannerDefaults.itl,
        "Inter-token latency (float, in milliseconds)",
    ),
):
    parser.add_argument(
        _latency_flag,
        type=float,
        default=_latency_default,
        help=_latency_help,
    )
# --- Load predictor options -------------------------------------------------

# Predictor name is accepted as a free-form string; argparse does not restrict
# it to the values listed in the help text.
parser.add_argument(
    "--load-predictor",
    help="Load predictor type (constant, arima, kalman, prophet)",
    default=SLAPlannerDefaults.load_predictor,
)

# NOTE(review): with action="store_true" the flag can only switch this on. If
# SLAPlannerDefaults.load_predictor_log1p were ever True there would be no
# command-line way to turn it off -- confirm the default stays False.
parser.add_argument(
    "--load-predictor-log1p",
    help="Model log1p(y) instead of y in the selected load predictor (ARIMA/Kalman/Prophet)",
    default=SLAPlannerDefaults.load_predictor_log1p,
    action="store_true",
)

# Integer history window for the Prophet predictor.
parser.add_argument(
    "--prophet-window-size",
    help="Prophet history window size",
    default=SLAPlannerDefaults.prophet_window_size,
    type=int,
)

# Optional warm-up trace path; stays None when the flag is not supplied.
parser.add_argument(
    "--load-predictor-warmup-trace",
    help="Optional path to a mooncake-style JSONL trace file used to warm up load predictors before observing live traffic",
    default=None,
    type=str,
)
# Kalman predictor noise parameters. All three are floats with defaults taken
# from SLAPlannerDefaults, so they are registered from one table; the
# original order is kept so --help output is unchanged.
for _kalman_flag, _kalman_default, _kalman_help in (
    (
        "--kalman-q-level",
        SLAPlannerDefaults.kalman_q_level,
        "Kalman process noise for level (higher = more responsive)",
    ),
    (
        "--kalman-q-trend",
        SLAPlannerDefaults.kalman_q_trend,
        "Kalman process noise for trend (higher = faster trend changes)",
    ),
    (
        "--kalman-r",
        SLAPlannerDefaults.kalman_r,
        "Kalman measurement noise (lower = remember less / react more to new measurements)",
    ),
):
    parser.add_argument(
        _kalman_flag,
        type=float,
        default=_kalman_default,
        help=_kalman_help,
    )
parser.add_argument(
"--kalman-min-points",
type=int,
default=SLAPlannerDefaults.kalman_min_points,
help="Minimum number of points before Kalman predictor returns forecasts",
description='PlannerPreDeploymentSweeping controls pre-deployment sweeping mode for planner in-depth profiling. "none" means no pre-deployment sweep (only load-based scaling). "rapid" uses AI Configurator to simulate engine performance. "thorough" uses real GPUs to measure engine performance (takes several hours).',
"description":"Pydantic configuration for the Dynamo Planner.\n\nReplaces the argparse-based CLI. All fields mirror the former CLI flags\nwith defaults sourced from SLAPlannerDefaults.",
"properties":{
"plannerPreDeploymentSweeping":{
"anyOf":[
{
"$ref":"#/$defs/PlannerPreDeploymentSweepMode"
},
{
"type":"null"
}
],
"default":"rapid",
"description":"PlannerPreDeploymentSweeping controls pre-deployment sweeping mode for planner in-depth profiling. \"none\" means no pre-deployment sweep (only load-based scaling). \"rapid\" uses AI Configurator to simulate engine performance. \"thorough\" uses real GPUs to measure engine performance (takes several hours)."