Unverified commit 65fa037d, authored by hhzhang16 and committed by GitHub
Browse files

feat: update directory structure in script and simplify planner scaling test (#3049)


Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
parent 38300f22
......@@ -168,7 +168,7 @@ Test complete scaling behavior including Kubernetes deployment and load generati
With custom namespace:
```bash
./scaling/run_scaling_test.sh --namespace production
./scaling/run_scaling_test.sh --namespace <namespace>
```
To save results to `tests/planner/e2e_scaling_results` instead of `/tmp`:
......@@ -186,8 +186,7 @@ To save results to `tests/planner/e2e_scaling_results` instead of `/tmp`:
The main test scenario validates prefill scaling for H200 with 1P1D → 2P1D configuration:
- **Phase 1**: 8 req/s for 90s (baseline - maintains 1P1D)
- **Phase 2**: 15 req/s for 120s (moderate load - maintains 1P1D)
- **Phase 3**: 25 req/s for 180s (scaling trigger - scales to 2P1D)
- **Phase 2**: 18 req/s for 120s (scaling trigger - scales to 2P1D)
- **ISL/OSL**: 4000/150 tokens (optimized for prefill bottleneck)
- **Transition delay**: 30s between phases
- **Total test duration**: ~4 minutes (90s + 30s transition + 120s) + scaling observation
......
......@@ -22,9 +22,7 @@ spec:
replicas: 1
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-301.6
args:
- "python3 -m dynamo.frontend --http-port 8000"
image: nvcr.io/nvidian/dynamo-dev/vllm-runtime:dep-301.6
Planner:
dynamoNamespace: vllm-disagg-planner
envFromSecret: hf-token-secret
......@@ -47,7 +45,7 @@ spec:
failureThreshold: 10
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-301.6
image: nvcr.io/nvidian/dynamo-dev/vllm-runtime:dep-301.6
workingDir: /workspace/components/planner/src/dynamo/planner
ports:
- name: metrics
......@@ -66,31 +64,36 @@ spec:
--ttft=0.1
--itl=0.01
--load-predictor=constant
--no-correction
Prometheus: # NOTE: this is set on Prometheus to ensure a service is created for the Prometheus component. This is a workaround and should be managed differently.
dynamoNamespace: vllm-disagg-planner
componentType: prometheus
componentType: frontend
replicas: 1
envs:
- name: PYTHONPATH
value: "/workspace/components/planner/src"
livenessProbe:
httpGet:
path: /
port: 9090
exec:
command:
- /bin/sh
- -c
- "exit 0"
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
readinessProbe:
httpGet:
path: /
port: 9090
exec:
command:
- /bin/sh
- -c
- "exit 0"
initialDelaySeconds: 30
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-301.6
image: nvcr.io/nvidian/dynamo-dev/vllm-runtime:dep-301.6
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
......@@ -113,7 +116,7 @@ spec:
port: 9090
periodSeconds: 30
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-301.6
image: nvcr.io/nvidian/dynamo-dev/vllm-runtime:dep-301.6
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
......@@ -136,7 +139,7 @@ spec:
port: 9090
periodSeconds: 30
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-301.6
image: nvcr.io/nvidian/dynamo-dev/vllm-runtime:dep-301.6
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
......
......@@ -14,7 +14,9 @@ set -e
# Configuration
NAMESPACE=${NAMESPACE:-default}
YAML_FILE="disagg_planner.yaml"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
YAML_FILE="$SCRIPT_DIR/disagg_planner.yaml"
TEST_FILE="$SCRIPT_DIR/../test_scaling_e2e.py"
FRONTEND_PORT=8000
LOCAL_PORT=8000
DEPLOYMENT_NAME="vllm-disagg-planner"
......@@ -194,7 +196,7 @@ cleanup_deployment() {
}
run_test() {
log_info "Running scaling test (graduated 8->15->25 req/s)..."
log_info "Running scaling test (graduated 8->18 req/s)..."
local python_cmd="python3"
if ! command -v python3 &> /dev/null; then
......@@ -207,7 +209,7 @@ run_test() {
log_info "Results will be saved to tests/planner/e2e_scaling_results"
fi
if $python_cmd test_scaling_e2e.py $test_args; then
if $python_cmd "$TEST_FILE" $test_args; then
log_success "Scaling test PASSED"
return 0
else
......@@ -248,7 +250,7 @@ main() {
log_info "SLA Planner Scaling Test"
log_info "Namespace: $NAMESPACE"
log_info "Scenario: Graduated 8->15->25 req/s (1P1D -> 2P1D prefill scaling, ISL=4000/OSL=150)"
log_info "Scenario: Graduated 8->18 req/s (1P1D -> 2P1D prefill scaling, ISL=4000/OSL=150)"
check_prerequisites
......
......@@ -62,6 +62,8 @@ def planner():
"profiling_results/H200_TP1P_TP1D",
)
args.environment = "kubernetes"
args.namespace = "test-namespace" # Required for Planner.__init__
args.no_correction = False # Required for Planner.__init__
# Mock the runtime
mock_runtime = Mock()
......
......@@ -276,8 +276,8 @@ class ScalingE2ETest:
Run the complete scaling test.
Hardcoded scenario:
- Phase 1 (12 req/s): Should maintain 1P1D
- Phase 2 (24 req/s): Should scale to 2P1D
- Phase 1 (8 req/s): Should maintain 1P1D
- Phase 2 (18 req/s): Should scale to 2P1D
"""
logger.info("Starting scaling integration test")
......@@ -289,8 +289,8 @@ class ScalingE2ETest:
# Start background monitoring
# Calculate based on actual phases from load generator
# Phase durations: baseline(90s) + transition(30s) + moderate(120s) + transition(30s) + trigger(180s) + buffer
total_test_duration = 90 + 30 + 120 + 30 + 180 + BUFFER_DURATION
# Phase durations: baseline(90s) + transition(30s) + trigger(120s) + buffer
total_test_duration = 90 + 30 + 120 + BUFFER_DURATION
monitoring_task = asyncio.create_task(
self.k8s_monitor.monitor_scaling(
total_test_duration, interval=MONITORING_INTERVAL
......@@ -299,19 +299,17 @@ class ScalingE2ETest:
# Initialize results in case of exception
baseline_results = {}
moderate_results = {}
trigger_results = {}
try:
# Use the load generator's built-in scaling test
logger.info("Running scaling scenario (8 req/s -> 15 req/s -> 25 req/s)")
logger.info("Running scaling scenario (8 req/s -> 18 req/s)")
load_results = await self.load_generator.run_scaling_test()
# Extract load results for analysis (3-phase structure)
# Extract load results for analysis (2-phase structure)
phase_results = load_results.get("phase_results", {})
baseline_results = phase_results.get("phase1_baseline", {})
moderate_results = phase_results.get("phase2_moderate", {})
trigger_results = phase_results.get("phase3_prefill_scaling_trigger", {})
trigger_results = phase_results.get("phase2_prefill_scaling_trigger", {})
# Check final pod counts
final_counts = self.k8s_monitor.get_pod_counts()
......@@ -342,14 +340,12 @@ class ScalingE2ETest:
"config": {
# Document actual test configuration
"baseline_rps": 8.0,
"moderate_rps": 15.0,
"trigger_rps": 25.0,
"phase_durations": {"baseline": 90, "moderate": 120, "trigger": 180},
"trigger_rps": 18.0,
"phase_durations": {"baseline": 90, "trigger": 120},
"transition_delay": 30,
},
"initial_pod_counts": initial_counts.__dict__ if initial_counts else None,
"baseline_results": baseline_results,
"moderate_results": moderate_results,
"trigger_results": trigger_results,
"final_pod_counts": final_counts.__dict__ if final_counts else None,
"final_final_pod_counts": final_final_counts.__dict__
......@@ -453,13 +449,10 @@ class ScalingE2ETest:
# Add performance validation across all phases
baseline = results.get("baseline_results", {})
moderate = results.get("moderate_results", {})
trigger = results.get("trigger_results", {})
if baseline.get("throughput", 0) > 0:
validation["baseline_throughput"] = f"{baseline['throughput']:.2f} req/s"
if moderate.get("throughput", 0) > 0:
validation["moderate_throughput"] = f"{moderate['throughput']:.2f} req/s"
if trigger.get("throughput", 0) > 0:
validation["trigger_throughput"] = f"{trigger['throughput']:.2f} req/s"
......
......@@ -131,10 +131,14 @@ class LoadGenerator:
]
logger.info(f"Running command: {' '.join(cmd)}")
logger.info(
f"Expected duration: {duration_sec}s, timeout: {max(duration_sec * 2 + 120, int(duration_sec * 2.5))}s"
)
# Run genai-perf (async)
start_time = time.time()
timeout = max(duration_sec + 60, int(duration_sec * 1.5))
# More generous timeout for high-load tests - allow 2x duration + 2 minutes buffer
timeout = max(duration_sec * 2 + 120, int(duration_sec * 2.5))
try:
proc = await asyncio.create_subprocess_exec(
*cmd,
......@@ -251,12 +255,11 @@ class LoadGenerator:
async def run_scaling_test(self) -> Dict[str, Any]:
"""
Run a multi-phase graduated scaling test for prefill scaling.
Run a graduated scaling test for prefill scaling.
Uses a conservative graduated approach:
- Phase 1: 5 req/s (baseline, should work)
- Phase 2: 10 req/s (moderate load)
- Phase 3: 18 req/s (should trigger prefill scaling to 2P1D)
- Phase 1: 8 req/s (baseline, should maintain 1P1D)
- Phase 2: 18 req/s (should trigger prefill scaling to 2P1D)
Returns:
Dictionary with complete test results
......@@ -269,8 +272,7 @@ class LoadGenerator:
# Graduated test parameters (optimized for prefill scaling)
phases: List[Dict[str, Any]] = [
{"rate": 8.0, "duration": 90, "name": "baseline"},
{"rate": 15.0, "duration": 120, "name": "moderate"},
{"rate": 25.0, "duration": 180, "name": "prefill_scaling_trigger"},
{"rate": 18.0, "duration": 120, "name": "prefill_scaling_trigger"},
]
transition_delay = 30
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment