feat: update directory structure in script and simplify planner scaling test (#3049)

Signed-off-by: Hannah Zhang <hannahz@nvidia.com>

feat: update directory structure in script and simplify planner scaling test (#3049)
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
65fa037d · hhzhang16 · GitHub · 38300f22 · 65fa037d · 65fa037d
Unverified Commit 65fa037d authored Sep 16, 2025 by hhzhang16 Committed by GitHub Sep 16, 2025
6 changed files
--- a/tests/planner/README.md
+++ b/tests/planner/README.md
@@ -168,7 +168,7 @@ Test complete scaling behavior including Kubernetes deployment and load generati

 With custom namespace:
 ```bash
-./scaling/run_scaling_test.sh --namespace production
+./scaling/run_scaling_test.sh --namespace <namespace>
 ```

 To save results to `tests/planner/e2e_scaling_results` instead of `/tmp`:
@@ -186,8 +186,7 @@ To save results to `tests/planner/e2e_scaling_results` instead of `/tmp`:
 The main test scenario validates prefill scaling for H200 with 1P1D → 2P1D configuration:

 - **Phase 1**: 8 req/s for 90s (baseline - maintains 1P1D)
- **Phase 2**: 15 req/s for 120s (moderate load - maintains 1P1D)
- **Phase 3**: 25 req/s for 180s (scaling trigger - scales to 2P1D)
+- **Phase 2**: 18 req/s for 120s (scaling trigger - scales to 2P1D)
 - **ISL/OSL**: 4000/150 tokens (optimized for prefill bottleneck)
 - **Transition delay**: 30s between phases
 - **Total test duration**: ~4 minutes (90s + 30s transition + 120s) + scaling observation

--- a/tests/planner/scaling/disagg_planner.yaml
+++ b/tests/planner/scaling/disagg_planner.yaml
@@ -22,9 +22,7 @@ spec:
      replicas: 1
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-301.6
-          args:
-            - "python3 -m dynamo.frontend --http-port 8000"
+          image: nvcr.io/nvidian/dynamo-dev/vllm-runtime:dep-301.6
    Planner:
      dynamoNamespace: vllm-disagg-planner
      envFromSecret: hf-token-secret
@@ -47,7 +45,7 @@ spec:
        failureThreshold: 10
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-301.6
+          image: nvcr.io/nvidian/dynamo-dev/vllm-runtime:dep-301.6
          workingDir: /workspace/components/planner/src/dynamo/planner
          ports:
            - name: metrics
@@ -66,31 +64,36 @@ spec:
              --ttft=0.1
              --itl=0.01
              --load-predictor=constant
+              --no-correction
    Prometheus: # NOTE: this is set on Prometheus to ensure a service is created for the Prometheus component. This is a workaround and should be managed differently.
      dynamoNamespace: vllm-disagg-planner
-      componentType: prometheus
+      componentType: frontend
      replicas: 1
      envs:
        - name: PYTHONPATH
          value: "/workspace/components/planner/src"
      livenessProbe:
-        httpGet:
-          path: /
-          port: 9090
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      readinessProbe:
-        httpGet:
-          path: /
-          port: 9090
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
        initialDelaySeconds: 30
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-301.6
+          image: nvcr.io/nvidian/dynamo-dev/vllm-runtime:dep-301.6
          workingDir: /workspace/components/backends/vllm
          command:
            - /bin/sh
@@ -113,7 +116,7 @@ spec:
              port: 9090
            periodSeconds: 30
            failureThreshold: 60
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-301.6
+          image: nvcr.io/nvidian/dynamo-dev/vllm-runtime:dep-301.6
          workingDir: /workspace/components/backends/vllm
          command:
            - /bin/sh
@@ -136,7 +139,7 @@ spec:
              port: 9090
            periodSeconds: 30
            failureThreshold: 60
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-301.6
+          image: nvcr.io/nvidian/dynamo-dev/vllm-runtime:dep-301.6
          workingDir: /workspace/components/backends/vllm
          command:
            - /bin/sh

--- a/tests/planner/scaling/run_scaling_test.sh
+++ b/tests/planner/scaling/run_scaling_test.sh
@@ -14,7 +14,9 @@ set -e

 # Configuration
 NAMESPACE=${NAMESPACE:-default}
-YAML_FILE="disagg_planner.yaml"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+YAML_FILE="$SCRIPT_DIR/disagg_planner.yaml"
+TEST_FILE="$SCRIPT_DIR/../test_scaling_e2e.py"
 FRONTEND_PORT=8000
 LOCAL_PORT=8000
 DEPLOYMENT_NAME="vllm-disagg-planner"
@@ -194,7 +196,7 @@ cleanup_deployment() {
 }

 run_test() {
-    log_info "Running scaling test (graduated 8->15->25 req/s)..."
+    log_info "Running scaling test (graduated 8->18 req/s)..."

    local python_cmd="python3"
    if ! command -v python3 &> /dev/null; then
@@ -207,7 +209,7 @@ run_test() {
        log_info "Results will be saved to tests/planner/e2e_scaling_results"
    fi

-    if $python_cmd test_scaling_e2e.py $test_args; then
+    if $python_cmd "$TEST_FILE" $test_args; then
        log_success "Scaling test PASSED"
        return 0
    else
@@ -248,7 +250,7 @@ main() {

    log_info "SLA Planner Scaling Test"
    log_info "Namespace: $NAMESPACE"
-    log_info "Scenario: Graduated 8->15->25 req/s (1P1D -> 2P1D prefill scaling, ISL=4000/OSL=150)"
+    log_info "Scenario: Graduated 8->18 req/s (1P1D -> 2P1D prefill scaling, ISL=4000/OSL=150)"

    check_prerequisites


--- a/tests/planner/test_replica_calculation.py
+++ b/tests/planner/test_replica_calculation.py
@@ -62,6 +62,8 @@ def planner():
        "profiling_results/H200_TP1P_TP1D",
    )
    args.environment = "kubernetes"
+    args.namespace = "test-namespace"  # Required for Planner.__init__
+    args.no_correction = False  # Required for Planner.__init__

    # Mock the runtime
    mock_runtime = Mock()

--- a/tests/planner/test_scaling_e2e.py
+++ b/tests/planner/test_scaling_e2e.py
@@ -276,8 +276,8 @@ class ScalingE2ETest:
        Run the complete scaling test.

        Hardcoded scenario:
-        - Phase 1 (12 req/s): Should maintain 1P1D
-        - Phase 2 (24 req/s): Should scale to 2P1D
+        - Phase 1 (8 req/s): Should maintain 1P1D
+        - Phase 2 (18 req/s): Should scale to 2P1D
        """
        logger.info("Starting scaling integration test")

@@ -289,8 +289,8 @@ class ScalingE2ETest:

        # Start background monitoring
        # Calculate based on actual phases from load generator
-        # Phase durations: baseline(90s) + transition(30s) + moderate(120s) + transition(30s) + trigger(180s) + buffer
-        total_test_duration = 90 + 30 + 120 + 30 + 180 + BUFFER_DURATION
+        # Phase durations: baseline(90s) + transition(30s) + trigger(120s) + buffer
+        total_test_duration = 90 + 30 + 120 + BUFFER_DURATION
        monitoring_task = asyncio.create_task(
            self.k8s_monitor.monitor_scaling(
                total_test_duration, interval=MONITORING_INTERVAL
@@ -299,19 +299,17 @@ class ScalingE2ETest:

        # Initialize results in case of exception
        baseline_results = {}
-        moderate_results = {}
        trigger_results = {}

        try:
            # Use the load generator's built-in scaling test
-            logger.info("Running scaling scenario (8 req/s -> 15 req/s -> 25 req/s)")
+            logger.info("Running scaling scenario (8 req/s -> 18 req/s)")
            load_results = await self.load_generator.run_scaling_test()

-            # Extract load results for analysis (3-phase structure)
+            # Extract load results for analysis (2-phase structure)
            phase_results = load_results.get("phase_results", {})
            baseline_results = phase_results.get("phase1_baseline", {})
-            moderate_results = phase_results.get("phase2_moderate", {})
-            trigger_results = phase_results.get("phase3_prefill_scaling_trigger", {})
+            trigger_results = phase_results.get("phase2_prefill_scaling_trigger", {})

            # Check final pod counts
            final_counts = self.k8s_monitor.get_pod_counts()
@@ -342,14 +340,12 @@ class ScalingE2ETest:
            "config": {
                # Document actual test configuration
                "baseline_rps": 8.0,
-                "moderate_rps": 15.0,
-                "trigger_rps": 25.0,
-                "phase_durations": {"baseline": 90, "moderate": 120, "trigger": 180},
+                "trigger_rps": 18.0,
+                "phase_durations": {"baseline": 90, "trigger": 120},
                "transition_delay": 30,
            },
            "initial_pod_counts": initial_counts.__dict__ if initial_counts else None,
            "baseline_results": baseline_results,
-            "moderate_results": moderate_results,
            "trigger_results": trigger_results,
            "final_pod_counts": final_counts.__dict__ if final_counts else None,
            "final_final_pod_counts": final_final_counts.__dict__
@@ -453,13 +449,10 @@ class ScalingE2ETest:

        # Add performance validation across all phases
        baseline = results.get("baseline_results", {})
-        moderate = results.get("moderate_results", {})
        trigger = results.get("trigger_results", {})

        if baseline.get("throughput", 0) > 0:
            validation["baseline_throughput"] = f"{baseline['throughput']:.2f} req/s"
-        if moderate.get("throughput", 0) > 0:
-            validation["moderate_throughput"] = f"{moderate['throughput']:.2f} req/s"
        if trigger.get("throughput", 0) > 0:
            validation["trigger_throughput"] = f"{trigger['throughput']:.2f} req/s"


--- a/tests/planner/utils/load_generator.py
+++ b/tests/planner/utils/load_generator.py
@@ -131,10 +131,14 @@ class LoadGenerator:
        ]

        logger.info(f"Running command: {' '.join(cmd)}")
+        logger.info(
+            f"Expected duration: {duration_sec}s, timeout: {max(duration_sec * 2 + 120, int(duration_sec * 2.5))}s"
+        )

        # Run genai-perf (async)
        start_time = time.time()
-        timeout = max(duration_sec + 60, int(duration_sec * 1.5))
+        # More generous timeout for high-load tests - allow 2x duration + 2 minutes buffer
+        timeout = max(duration_sec * 2 + 120, int(duration_sec * 2.5))
        try:
            proc = await asyncio.create_subprocess_exec(
                *cmd,
@@ -251,12 +255,11 @@ class LoadGenerator:

    async def run_scaling_test(self) -> Dict[str, Any]:
        """
-        Run a multi-phase graduated scaling test for prefill scaling.
+        Run a graduated scaling test for prefill scaling.

        Uses a conservative graduated approach:
-        - Phase 1: 5 req/s (baseline, should work)
-        - Phase 2: 10 req/s (moderate load)
-        - Phase 3: 18 req/s (should trigger prefill scaling to 2P1D)
+        - Phase 1: 8 req/s (baseline, should maintain 1P1D)
+        - Phase 2: 18 req/s (should trigger prefill scaling to 2P1D)

        Returns:
            Dictionary with complete test results
@@ -269,8 +272,7 @@ class LoadGenerator:
        # Graduated test parameters (optimized for prefill scaling)
        phases: List[Dict[str, Any]] = [
            {"rate": 8.0, "duration": 90, "name": "baseline"},
-            {"rate": 15.0, "duration": 120, "name": "moderate"},
-            {"rate": 25.0, "duration": 180, "name": "prefill_scaling_trigger"},
+            {"rate": 18.0, "duration": 120, "name": "prefill_scaling_trigger"},
        ]
        transition_delay = 30