Unverified commit 359765d3, authored by Hongkuan Zhou, committed by GitHub
Browse files

feat: load-based scaling in SLA Planner (#6145)


Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
parent 815b1291
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# DynamoGraphDeployment manifest for "vllm-disagg-planner".
# Declares four services: Frontend, Planner, VllmDecodeWorker and
# VllmPrefillWorker, each starting with a single replica.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: vllm-disagg-planner
spec:
services:
# Frontend service: one replica of the vllm-runtime image.
Frontend:
componentType: frontend
replicas: 1
extraPodSpec:
mainContainer:
# NOTE(review): "my-tag" looks like a placeholder image tag — confirm it is
# substituted with a real tag before deploying.
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
# Planner service: runs `python3 -m planner_sla` from the planner source directory.
Planner:
componentType: planner
replicas: 1
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/components/src/dynamo/planner
command:
- python3
- -m
- planner_sla
args:
- --environment=kubernetes
- --backend=vllm
# Unit is not stated here; presumably seconds — verify against the planner CLI.
- --adjustment-interval=60
# Profiling results directory; the path name suggests H200, TP1 prefill / TP1 decode — TODO confirm.
- --profile-results-dir=/workspace/tests/planner/profiling_results/H200_TP1P_TP1D
# Presumably disables the planner's correction factors — verify against the planner CLI.
- --no-correction
# Decode worker: runs `python3 -m dynamo.vllm` with a limit of one GPU.
VllmDecodeWorker:
# Environment is populated from the "hf-token-secret" secret.
envFromSecret: hf-token-secret
componentType: worker
subComponentType: decode
replicas: 1
resources:
limits:
gpu: "1"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/examples/backends/vllm
command:
- python3
args:
- -m
- dynamo.vllm
- --model
- nvidia/Llama-3.1-8B-Instruct-FP8
# Prefill worker: same image, model and GPU limit as the decode worker,
# with --is-prefill-worker added to the arguments.
VllmPrefillWorker:
# Environment is populated from the "hf-token-secret" secret.
envFromSecret: hf-token-secret
componentType: worker
subComponentType: prefill
replicas: 1
resources:
limits:
gpu: "1"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/examples/backends/vllm
command:
- python3
args:
- -m
- dynamo.vllm
- --model
- nvidia/Llama-3.1-8B-Instruct-FP8
- --is-prefill-worker
......@@ -7,20 +7,24 @@
# 1. Deploys the disaggregated planner if not already running
# 2. Sets up port forwarding to localhost:8000
# 3. Waits for the deployment to be ready
# 4. Runs the hardcoded scaling test (12 req/s -> 24 req/s)
# 4. Runs the scaling test (8 req/s -> 18 req/s)
# 5. Cleans up
#
# Supports two modes:
# --mode throughput (default) Uses throughput-based planner
# --mode load Uses load-based planner with regression scaling
set -e
# Configuration
NAMESPACE=${NAMESPACE:-default}
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
YAML_FILE="$SCRIPT_DIR/disagg_planner.yaml"
TEST_FILE="$SCRIPT_DIR/../test_scaling_e2e.py"
FRONTEND_PORT=8000
LOCAL_PORT=8000
DEPLOYMENT_NAME="vllm-disagg-planner"
SAVE_RESULTS=false
MODE="throughput"
# Colors for output
RED='\033[0;31m'
......@@ -198,14 +202,14 @@ cleanup_deployment() {
}
run_test() {
log_info "Running scaling test (graduated 8->18 req/s)..."
log_info "Running scaling test (graduated 8->18 req/s, mode=$MODE)..."
local python_cmd="python3"
if ! command -v python3 &> /dev/null; then
python_cmd="python"
fi
local test_args="--namespace $NAMESPACE"
local test_args="--namespace $NAMESPACE --mode $MODE"
if [ "$SAVE_RESULTS" = true ]; then
test_args="$test_args --save-results"
log_info "Results will be saved to tests/planner/e2e_scaling_results"
......@@ -227,17 +231,26 @@ main() {
NAMESPACE="$2"
shift 2
;;
--mode)
MODE="$2"
if [[ "$MODE" != "throughput" && "$MODE" != "load" ]]; then
log_error "Invalid mode: $MODE (must be 'throughput' or 'load')"
exit 1
fi
shift 2
;;
--save-results)
SAVE_RESULTS=true
shift
;;
--help)
echo "Usage: $0 [--namespace NS] [--save-results]"
echo "Usage: $0 [--namespace NS] [--mode MODE] [--save-results]"
echo ""
echo "Run SLA planner scaling test (graduated 8->15->25 req/s prefill scaling)"
echo "Run SLA planner scaling test (graduated 8->18 req/s prefill scaling)"
echo ""
echo "Options:"
echo " --namespace NS Kubernetes namespace (default: default)"
echo " --mode MODE Scaling mode: 'throughput' (default) or 'load'"
echo " --save-results Save results to tests/planner/e2e_scaling_results instead of /tmp"
echo " --help Show this help"
exit 0
......@@ -250,8 +263,17 @@ main() {
esac
done
# Select YAML based on mode
if [ "$MODE" = "load" ]; then
YAML_FILE="$SCRIPT_DIR/disagg_planner_load.yaml"
else
YAML_FILE="$SCRIPT_DIR/disagg_planner_throughput.yaml"
fi
log_info "SLA Planner Scaling Test"
log_info "Namespace: $NAMESPACE"
log_info "Mode: $MODE"
log_info "YAML: $YAML_FILE"
log_info "Scenario: Graduated 8->18 req/s (1P1D -> 2P1D prefill scaling, ISL=4000/OSL=150)"
check_prerequisites
......
......@@ -16,13 +16,13 @@ from unittest.mock import Mock, patch
import pytest
from dynamo.planner.utils.decode_planner import DecodePlanner
from dynamo.planner.utils.planner_core import (
DecodePlanner,
Metrics,
PlannerSharedState,
PrefillPlanner,
_apply_global_gpu_budget,
)
from dynamo.planner.utils.prefill_planner import PrefillPlanner
from dynamo.planner.utils.prometheus import Metrics
pytestmark = [pytest.mark.pre_merge, pytest.mark.gpu_0]
......@@ -78,7 +78,7 @@ class PlannerHarness:
"isl_predictor",
"osl_predictor",
"connector",
"prometheus_api_client",
"prometheus_traffic_client",
"args",
}
prefill_attrs = {
......@@ -111,7 +111,7 @@ class PlannerHarness:
"isl_predictor",
"osl_predictor",
"connector",
"prometheus_api_client",
"prometheus_traffic_client",
"args",
"get_workers_info",
}
......@@ -194,7 +194,7 @@ def planner():
planner.connector = Mock()
# Mock prometheus client
planner.prometheus_api_client = Mock()
planner.prometheus_traffic_client = Mock()
# Set up some baseline correction factors
planner.p_correction_factor = 1.0
......
......@@ -261,10 +261,12 @@ class ScalingE2ETest:
namespace: str = "default",
base_url: str = "http://localhost:8000",
save_results: bool = False,
mode: str = "throughput",
):
self.namespace = namespace
self.base_url = base_url
self.save_results = save_results
self.mode = mode
self.k8s_monitor = KubernetesMonitor(namespace)
self.load_generator = LoadGenerator(
......@@ -281,7 +283,7 @@ class ScalingE2ETest:
- Phase 1 (8 req/s): Should maintain 1P1D
- Phase 2 (18 req/s): Should scale to 2P1D
"""
logger.info("Starting scaling integration test")
logger.info(f"Starting scaling integration test (mode={self.mode})")
test_start_time = time.time()
......@@ -291,8 +293,12 @@ class ScalingE2ETest:
# Start background monitoring
# Calculate based on actual phases from load generator
# Phase durations: baseline(90s) + transition(30s) + trigger(120s) + buffer
total_test_duration = 90 + 30 + 120 + BUFFER_DURATION
if self.mode == "load":
# Load-based: baseline(120s) + transition(30s) + trigger(120s) + buffer
total_test_duration = 120 + 30 + 120 + BUFFER_DURATION
else:
# Throughput: baseline(90s) + transition(30s) + trigger(120s) + buffer
total_test_duration = 90 + 30 + 120 + BUFFER_DURATION
monitoring_task = asyncio.create_task(
self.k8s_monitor.monitor_scaling(
total_test_duration, interval=MONITORING_INTERVAL
......@@ -305,8 +311,10 @@ class ScalingE2ETest:
try:
# Use the load generator's built-in scaling test
logger.info("Running scaling scenario (8 req/s -> 18 req/s)")
load_results = await self.load_generator.run_scaling_test()
logger.info(
f"Running scaling scenario (8 req/s -> 18 req/s, mode={self.mode})"
)
load_results = await self.load_generator.run_scaling_test(mode=self.mode)
# Extract load results for analysis (2-phase structure)
phase_results = load_results.get("phase_results", {})
......@@ -475,12 +483,20 @@ async def main():
action="store_true",
help="Save results to tests/planner/e2e_scaling_results instead of /tmp",
)
# No additional arguments needed - test is hardcoded
parser.add_argument(
"--mode",
choices=["throughput", "load"],
default="throughput",
help="Scaling mode to test: throughput (default) or load",
)
args = parser.parse_args()
test = ScalingE2ETest(
namespace=args.namespace, base_url=args.base_url, save_results=args.save_results
namespace=args.namespace,
base_url=args.base_url,
save_results=args.save_results,
mode=args.mode,
)
try:
......
This diff is collapsed.
......@@ -9,13 +9,10 @@ from unittest.mock import Mock, patch
import pytest
from dynamo.planner.utils.decode_planner import DecodePlanner
from dynamo.planner.utils.exceptions import DeploymentValidationError
from dynamo.planner.utils.planner_core import (
DecodePlanner,
PlannerSharedState,
PrefillPlanner,
_initialize_gpu_counts,
)
from dynamo.planner.utils.planner_core import PlannerSharedState, _initialize_gpu_counts
from dynamo.planner.utils.prefill_planner import PrefillPlanner
pytestmark = [
pytest.mark.gpu_0,
......@@ -82,8 +79,8 @@ def _build_planners(args, prometheus_client):
shared_state = PlannerSharedState()
prefill_planner = PrefillPlanner(None, args, shared_state=shared_state)
decode_planner = DecodePlanner(None, args, shared_state=shared_state)
prefill_planner.prometheus_api_client = prometheus_client
decode_planner.prometheus_api_client = prometheus_client
prefill_planner.prometheus_traffic_client = prometheus_client
decode_planner.prometheus_traffic_client = prometheus_client
prefill_planner.model_name = "test-model"
decode_planner.model_name = "test-model"
......@@ -131,7 +128,7 @@ def _expected_decode(args, decode_planner, sample):
def _run_interval(prefill_planner, decode_planner, shared_state):
asyncio.run(
prefill_planner.observe_metrics(require_prefill=True, require_decode=True)
prefill_planner.observe_traffic_stats(require_prefill=True, require_decode=True)
)
decode_planner.update_predictors_from_metrics(shared_state.last_metrics)
next_num_p = prefill_planner.plan_adjustment()
......
......@@ -230,7 +230,7 @@ class LoadGenerator:
logger.warning(f"Failed to parse aiperf results: {e}")
return {}
async def run_scaling_test(self) -> Dict[str, Any]:
async def run_scaling_test(self, mode: str = "throughput") -> Dict[str, Any]:
"""
Run a graduated scaling test for prefill scaling.
......@@ -238,17 +238,23 @@ class LoadGenerator:
- Phase 1: 8 req/s (baseline, should maintain 1P1D)
- Phase 2: 18 req/s (should trigger prefill scaling to 2P1D)
Args:
mode: Scaling mode - "throughput" or "load".
"load" uses a longer baseline for regression warmup.
Returns:
Dictionary with complete test results
"""
logger.info(
"Starting graduated prefill scaling test scenario (targeting 1P1D -> 2P1D)"
f"Starting graduated prefill scaling test scenario (targeting 1P1D -> 2P1D, mode={mode})"
)
logger.info("Using conservative graduated approach with metric generation")
# Graduated test parameters (optimized for prefill scaling)
# Load-based scaling needs longer baseline for regression warmup
baseline_duration = 120 if mode == "load" else 90
phases: List[Dict[str, Any]] = [
{"rate": 8.0, "duration": 90, "name": "baseline"},
{"rate": 8.0, "duration": baseline_duration, "name": "baseline"},
{"rate": 18.0, "duration": 120, "name": "prefill_scaling_trigger"},
]
transition_delay = 30
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment