Unverified commit 359765d3, authored by Hongkuan Zhou, committed by GitHub
Browse files

feat: load-based scaling in SLA Planner (#6145)


Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
parent 815b1291
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# DynamoGraphDeployment for a disaggregated vLLM setup with an SLA planner:
# one frontend, one planner, one decode worker and one prefill worker.
# NOTE(review): the nesting below was reconstructed from a flattened copy of
# this manifest — confirm it against the DynamoGraphDeployment CRD schema.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: vllm-disagg-planner
spec:
  services:
    Frontend:
      componentType: frontend
      replicas: 1
      extraPodSpec:
        mainContainer:
          # NOTE(review): "my-tag" looks like a placeholder image tag —
          # replace with the tag you actually built/pushed.
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
    Planner:
      componentType: planner
      replicas: 1
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
          workingDir: /workspace/components/src/dynamo/planner
          # Launches the planner as `python3 -m planner_sla` from workingDir.
          command:
            - python3
            - -m
            - planner_sla
          args:
            - --environment=kubernetes
            - --backend=vllm
            # presumably seconds between scaling decisions — TODO confirm unit
            - --adjustment-interval=60
            - --profile-results-dir=/workspace/tests/planner/profiling_results/H200_TP1P_TP1D
            - --no-correction
    VllmDecodeWorker:
      # presumably supplies the Hugging Face token needed to pull the model
      # — verify the secret's contents
      envFromSecret: hf-token-secret
      componentType: worker
      subComponentType: decode
      replicas: 1
      resources:
        limits:
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
          workingDir: /workspace/examples/backends/vllm
          # Effective command line: python3 -m dynamo.vllm --model <model>
          command:
            - python3
          args:
            - -m
            - dynamo.vllm
            - --model
            - nvidia/Llama-3.1-8B-Instruct-FP8
    VllmPrefillWorker:
      envFromSecret: hf-token-secret
      componentType: worker
      subComponentType: prefill
      replicas: 1
      resources:
        limits:
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
          workingDir: /workspace/examples/backends/vllm
          # Same invocation as the decode worker, plus --is-prefill-worker.
          command:
            - python3
          args:
            - -m
            - dynamo.vllm
            - --model
            - nvidia/Llama-3.1-8B-Instruct-FP8
            - --is-prefill-worker
...@@ -7,20 +7,24 @@ ...@@ -7,20 +7,24 @@
# 1. Deploys the disaggregated planner if not already running # 1. Deploys the disaggregated planner if not already running
# 2. Sets up port forwarding to localhost:8000 # 2. Sets up port forwarding to localhost:8000
# 3. Waits for the deployment to be ready # 3. Waits for the deployment to be ready
# 4. Runs the hardcoded scaling test (12 req/s -> 24 req/s) # 4. Runs the scaling test (8 req/s -> 18 req/s)
# 5. Cleans up # 5. Cleans up
#
# Supports two modes:
# --mode throughput (default) Uses throughput-based planner
# --mode load Uses load-based planner with regression scaling
set -e set -e
# Configuration # Configuration
NAMESPACE=${NAMESPACE:-default} NAMESPACE=${NAMESPACE:-default}
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
YAML_FILE="$SCRIPT_DIR/disagg_planner.yaml"
TEST_FILE="$SCRIPT_DIR/../test_scaling_e2e.py" TEST_FILE="$SCRIPT_DIR/../test_scaling_e2e.py"
FRONTEND_PORT=8000 FRONTEND_PORT=8000
LOCAL_PORT=8000 LOCAL_PORT=8000
DEPLOYMENT_NAME="vllm-disagg-planner" DEPLOYMENT_NAME="vllm-disagg-planner"
SAVE_RESULTS=false SAVE_RESULTS=false
MODE="throughput"
# Colors for output # Colors for output
RED='\033[0;31m' RED='\033[0;31m'
...@@ -198,14 +202,14 @@ cleanup_deployment() { ...@@ -198,14 +202,14 @@ cleanup_deployment() {
} }
run_test() { run_test() {
log_info "Running scaling test (graduated 8->18 req/s)..." log_info "Running scaling test (graduated 8->18 req/s, mode=$MODE)..."
local python_cmd="python3" local python_cmd="python3"
if ! command -v python3 &> /dev/null; then if ! command -v python3 &> /dev/null; then
python_cmd="python" python_cmd="python"
fi fi
local test_args="--namespace $NAMESPACE" local test_args="--namespace $NAMESPACE --mode $MODE"
if [ "$SAVE_RESULTS" = true ]; then if [ "$SAVE_RESULTS" = true ]; then
test_args="$test_args --save-results" test_args="$test_args --save-results"
log_info "Results will be saved to tests/planner/e2e_scaling_results" log_info "Results will be saved to tests/planner/e2e_scaling_results"
...@@ -227,17 +231,26 @@ main() { ...@@ -227,17 +231,26 @@ main() {
NAMESPACE="$2" NAMESPACE="$2"
shift 2 shift 2
;; ;;
--mode)
MODE="$2"
if [[ "$MODE" != "throughput" && "$MODE" != "load" ]]; then
log_error "Invalid mode: $MODE (must be 'throughput' or 'load')"
exit 1
fi
shift 2
;;
--save-results) --save-results)
SAVE_RESULTS=true SAVE_RESULTS=true
shift shift
;; ;;
--help) --help)
echo "Usage: $0 [--namespace NS] [--save-results]" echo "Usage: $0 [--namespace NS] [--mode MODE] [--save-results]"
echo "" echo ""
echo "Run SLA planner scaling test (graduated 8->15->25 req/s prefill scaling)" echo "Run SLA planner scaling test (graduated 8->18 req/s prefill scaling)"
echo "" echo ""
echo "Options:" echo "Options:"
echo " --namespace NS Kubernetes namespace (default: default)" echo " --namespace NS Kubernetes namespace (default: default)"
echo " --mode MODE Scaling mode: 'throughput' (default) or 'load'"
echo " --save-results Save results to tests/planner/e2e_scaling_results instead of /tmp" echo " --save-results Save results to tests/planner/e2e_scaling_results instead of /tmp"
echo " --help Show this help" echo " --help Show this help"
exit 0 exit 0
...@@ -250,8 +263,17 @@ main() { ...@@ -250,8 +263,17 @@ main() {
esac esac
done done
# Select YAML based on mode
if [ "$MODE" = "load" ]; then
YAML_FILE="$SCRIPT_DIR/disagg_planner_load.yaml"
else
YAML_FILE="$SCRIPT_DIR/disagg_planner_throughput.yaml"
fi
log_info "SLA Planner Scaling Test" log_info "SLA Planner Scaling Test"
log_info "Namespace: $NAMESPACE" log_info "Namespace: $NAMESPACE"
log_info "Mode: $MODE"
log_info "YAML: $YAML_FILE"
log_info "Scenario: Graduated 8->18 req/s (1P1D -> 2P1D prefill scaling, ISL=4000/OSL=150)" log_info "Scenario: Graduated 8->18 req/s (1P1D -> 2P1D prefill scaling, ISL=4000/OSL=150)"
check_prerequisites check_prerequisites
......
...@@ -16,13 +16,13 @@ from unittest.mock import Mock, patch ...@@ -16,13 +16,13 @@ from unittest.mock import Mock, patch
import pytest import pytest
from dynamo.planner.utils.decode_planner import DecodePlanner
from dynamo.planner.utils.planner_core import ( from dynamo.planner.utils.planner_core import (
DecodePlanner,
Metrics,
PlannerSharedState, PlannerSharedState,
PrefillPlanner,
_apply_global_gpu_budget, _apply_global_gpu_budget,
) )
from dynamo.planner.utils.prefill_planner import PrefillPlanner
from dynamo.planner.utils.prometheus import Metrics
pytestmark = [pytest.mark.pre_merge, pytest.mark.gpu_0] pytestmark = [pytest.mark.pre_merge, pytest.mark.gpu_0]
...@@ -78,7 +78,7 @@ class PlannerHarness: ...@@ -78,7 +78,7 @@ class PlannerHarness:
"isl_predictor", "isl_predictor",
"osl_predictor", "osl_predictor",
"connector", "connector",
"prometheus_api_client", "prometheus_traffic_client",
"args", "args",
} }
prefill_attrs = { prefill_attrs = {
...@@ -111,7 +111,7 @@ class PlannerHarness: ...@@ -111,7 +111,7 @@ class PlannerHarness:
"isl_predictor", "isl_predictor",
"osl_predictor", "osl_predictor",
"connector", "connector",
"prometheus_api_client", "prometheus_traffic_client",
"args", "args",
"get_workers_info", "get_workers_info",
} }
...@@ -194,7 +194,7 @@ def planner(): ...@@ -194,7 +194,7 @@ def planner():
planner.connector = Mock() planner.connector = Mock()
# Mock prometheus client # Mock prometheus client
planner.prometheus_api_client = Mock() planner.prometheus_traffic_client = Mock()
# Set up some baseline correction factors # Set up some baseline correction factors
planner.p_correction_factor = 1.0 planner.p_correction_factor = 1.0
......
...@@ -261,10 +261,12 @@ class ScalingE2ETest: ...@@ -261,10 +261,12 @@ class ScalingE2ETest:
namespace: str = "default", namespace: str = "default",
base_url: str = "http://localhost:8000", base_url: str = "http://localhost:8000",
save_results: bool = False, save_results: bool = False,
mode: str = "throughput",
): ):
self.namespace = namespace self.namespace = namespace
self.base_url = base_url self.base_url = base_url
self.save_results = save_results self.save_results = save_results
self.mode = mode
self.k8s_monitor = KubernetesMonitor(namespace) self.k8s_monitor = KubernetesMonitor(namespace)
self.load_generator = LoadGenerator( self.load_generator = LoadGenerator(
...@@ -281,7 +283,7 @@ class ScalingE2ETest: ...@@ -281,7 +283,7 @@ class ScalingE2ETest:
- Phase 1 (8 req/s): Should maintain 1P1D - Phase 1 (8 req/s): Should maintain 1P1D
- Phase 2 (18 req/s): Should scale to 2P1D - Phase 2 (18 req/s): Should scale to 2P1D
""" """
logger.info("Starting scaling integration test") logger.info(f"Starting scaling integration test (mode={self.mode})")
test_start_time = time.time() test_start_time = time.time()
...@@ -291,8 +293,12 @@ class ScalingE2ETest: ...@@ -291,8 +293,12 @@ class ScalingE2ETest:
# Start background monitoring # Start background monitoring
# Calculate based on actual phases from load generator # Calculate based on actual phases from load generator
# Phase durations: baseline(90s) + transition(30s) + trigger(120s) + buffer if self.mode == "load":
total_test_duration = 90 + 30 + 120 + BUFFER_DURATION # Load-based: baseline(120s) + transition(30s) + trigger(120s) + buffer
total_test_duration = 120 + 30 + 120 + BUFFER_DURATION
else:
# Throughput: baseline(90s) + transition(30s) + trigger(120s) + buffer
total_test_duration = 90 + 30 + 120 + BUFFER_DURATION
monitoring_task = asyncio.create_task( monitoring_task = asyncio.create_task(
self.k8s_monitor.monitor_scaling( self.k8s_monitor.monitor_scaling(
total_test_duration, interval=MONITORING_INTERVAL total_test_duration, interval=MONITORING_INTERVAL
...@@ -305,8 +311,10 @@ class ScalingE2ETest: ...@@ -305,8 +311,10 @@ class ScalingE2ETest:
try: try:
# Use the load generator's built-in scaling test # Use the load generator's built-in scaling test
logger.info("Running scaling scenario (8 req/s -> 18 req/s)") logger.info(
load_results = await self.load_generator.run_scaling_test() f"Running scaling scenario (8 req/s -> 18 req/s, mode={self.mode})"
)
load_results = await self.load_generator.run_scaling_test(mode=self.mode)
# Extract load results for analysis (2-phase structure) # Extract load results for analysis (2-phase structure)
phase_results = load_results.get("phase_results", {}) phase_results = load_results.get("phase_results", {})
...@@ -475,12 +483,20 @@ async def main(): ...@@ -475,12 +483,20 @@ async def main():
action="store_true", action="store_true",
help="Save results to tests/planner/e2e_scaling_results instead of /tmp", help="Save results to tests/planner/e2e_scaling_results instead of /tmp",
) )
# No additional arguments needed - test is hardcoded parser.add_argument(
"--mode",
choices=["throughput", "load"],
default="throughput",
help="Scaling mode to test: throughput (default) or load",
)
args = parser.parse_args() args = parser.parse_args()
test = ScalingE2ETest( test = ScalingE2ETest(
namespace=args.namespace, base_url=args.base_url, save_results=args.save_results namespace=args.namespace,
base_url=args.base_url,
save_results=args.save_results,
mode=args.mode,
) )
try: try:
......
This diff is collapsed.
...@@ -9,13 +9,10 @@ from unittest.mock import Mock, patch ...@@ -9,13 +9,10 @@ from unittest.mock import Mock, patch
import pytest import pytest
from dynamo.planner.utils.decode_planner import DecodePlanner
from dynamo.planner.utils.exceptions import DeploymentValidationError from dynamo.planner.utils.exceptions import DeploymentValidationError
from dynamo.planner.utils.planner_core import ( from dynamo.planner.utils.planner_core import PlannerSharedState, _initialize_gpu_counts
DecodePlanner, from dynamo.planner.utils.prefill_planner import PrefillPlanner
PlannerSharedState,
PrefillPlanner,
_initialize_gpu_counts,
)
pytestmark = [ pytestmark = [
pytest.mark.gpu_0, pytest.mark.gpu_0,
...@@ -82,8 +79,8 @@ def _build_planners(args, prometheus_client): ...@@ -82,8 +79,8 @@ def _build_planners(args, prometheus_client):
shared_state = PlannerSharedState() shared_state = PlannerSharedState()
prefill_planner = PrefillPlanner(None, args, shared_state=shared_state) prefill_planner = PrefillPlanner(None, args, shared_state=shared_state)
decode_planner = DecodePlanner(None, args, shared_state=shared_state) decode_planner = DecodePlanner(None, args, shared_state=shared_state)
prefill_planner.prometheus_api_client = prometheus_client prefill_planner.prometheus_traffic_client = prometheus_client
decode_planner.prometheus_api_client = prometheus_client decode_planner.prometheus_traffic_client = prometheus_client
prefill_planner.model_name = "test-model" prefill_planner.model_name = "test-model"
decode_planner.model_name = "test-model" decode_planner.model_name = "test-model"
...@@ -131,7 +128,7 @@ def _expected_decode(args, decode_planner, sample): ...@@ -131,7 +128,7 @@ def _expected_decode(args, decode_planner, sample):
def _run_interval(prefill_planner, decode_planner, shared_state): def _run_interval(prefill_planner, decode_planner, shared_state):
asyncio.run( asyncio.run(
prefill_planner.observe_metrics(require_prefill=True, require_decode=True) prefill_planner.observe_traffic_stats(require_prefill=True, require_decode=True)
) )
decode_planner.update_predictors_from_metrics(shared_state.last_metrics) decode_planner.update_predictors_from_metrics(shared_state.last_metrics)
next_num_p = prefill_planner.plan_adjustment() next_num_p = prefill_planner.plan_adjustment()
......
...@@ -230,7 +230,7 @@ class LoadGenerator: ...@@ -230,7 +230,7 @@ class LoadGenerator:
logger.warning(f"Failed to parse aiperf results: {e}") logger.warning(f"Failed to parse aiperf results: {e}")
return {} return {}
async def run_scaling_test(self) -> Dict[str, Any]: async def run_scaling_test(self, mode: str = "throughput") -> Dict[str, Any]:
""" """
Run a graduated scaling test for prefill scaling. Run a graduated scaling test for prefill scaling.
...@@ -238,17 +238,23 @@ class LoadGenerator: ...@@ -238,17 +238,23 @@ class LoadGenerator:
- Phase 1: 8 req/s (baseline, should maintain 1P1D) - Phase 1: 8 req/s (baseline, should maintain 1P1D)
- Phase 2: 18 req/s (should trigger prefill scaling to 2P1D) - Phase 2: 18 req/s (should trigger prefill scaling to 2P1D)
Args:
mode: Scaling mode - "throughput" or "load".
"load" uses a longer baseline for regression warmup.
Returns: Returns:
Dictionary with complete test results Dictionary with complete test results
""" """
logger.info( logger.info(
"Starting graduated prefill scaling test scenario (targeting 1P1D -> 2P1D)" f"Starting graduated prefill scaling test scenario (targeting 1P1D -> 2P1D, mode={mode})"
) )
logger.info("Using conservative graduated approach with metric generation") logger.info("Using conservative graduated approach with metric generation")
# Graduated test parameters (optimized for prefill scaling) # Graduated test parameters (optimized for prefill scaling)
# Load-based scaling needs longer baseline for regression warmup
baseline_duration = 120 if mode == "load" else 90
phases: List[Dict[str, Any]] = [ phases: List[Dict[str, Any]] = [
{"rate": 8.0, "duration": 90, "name": "baseline"}, {"rate": 8.0, "duration": baseline_duration, "name": "baseline"},
{"rate": 18.0, "duration": 120, "name": "prefill_scaling_trigger"}, {"rate": 18.0, "duration": 120, "name": "prefill_scaling_trigger"},
] ]
transition_delay = 30 transition_delay = 30
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment