"launch/dynamo-run/vscode:/vscode.git/clone" did not exist on "f0ca16f0333ec37a1738e0590109b29a9bada8bf"
Unverified Commit ab5a31b5 authored by Alec's avatar Alec Committed by GitHub
Browse files

test(planner): isolate planner-family suites [DYN-2534] (#7723)

parent cc22114d
...@@ -37,7 +37,7 @@ spec: ...@@ -37,7 +37,7 @@ spec:
memory: "100Gi" memory: "100Gi"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: my-registry/vllm-runtime:my-tag image: my-registry/dynamo-frontend:my-tag
workingDir: /workspace/examples/backends/vllm workingDir: /workspace/examples/backends/vllm
command: command:
- /bin/sh - /bin/sh
......
...@@ -37,7 +37,7 @@ spec: ...@@ -37,7 +37,7 @@ spec:
memory: "100Gi" memory: "100Gi"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: my-registry/vllm-runtime:my-tag image: my-registry/dynamo-frontend:my-tag
workingDir: /workspace/examples/backends/vllm workingDir: /workspace/examples/backends/vllm
command: command:
- /bin/sh - /bin/sh
......
...@@ -40,7 +40,7 @@ spec: ...@@ -40,7 +40,7 @@ spec:
memory: "100Gi" memory: "100Gi"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: my-registry/vllm-runtime:my-tag image: my-registry/dynamo-frontend:my-tag
workingDir: /workspace/examples/backends/vllm workingDir: /workspace/examples/backends/vllm
command: command:
- /bin/sh - /bin/sh
...@@ -72,17 +72,17 @@ spec: ...@@ -72,17 +72,17 @@ spec:
failureThreshold: 10 failureThreshold: 10
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: my-registry/vllm-runtime:my-tag image: my-registry/dynamo-planner:my-tag
ports: ports:
- name: metrics - name: metrics
containerPort: 9085 containerPort: 9085
command: command:
- python3 - python3
- -m - -m
- dynamo.planner - dynamo.planner
args: args:
- --config - --config
- '{"environment": "kubernetes", "backend": "vllm", "ttft": 200, "itl": 10, "profile_results_dir": "/workspace/tests/planner/profiling_results/H200_TP1P_TP1D/", "throughput_adjustment_interval": 60, "metric_reporting_prometheus_port": 9085, "no_correction": true}' - '{"environment": "kubernetes", "backend": "vllm", "ttft": 200, "itl": 10, "profile_results_dir": "/workspace/components/src/dynamo/planner/tests/data/profiling_results/H200_TP1P_TP1D/", "throughput_adjustment_interval": 60, "metric_reporting_prometheus_port": 9085, "no_correction": true}'
VllmDecodeWorker: VllmDecodeWorker:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
......
...@@ -37,7 +37,7 @@ spec: ...@@ -37,7 +37,7 @@ spec:
memory: "100Gi" memory: "100Gi"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: my-registry/vllm-runtime:my-tag image: my-registry/dynamo-frontend:my-tag
workingDir: /workspace/examples/backends/vllm workingDir: /workspace/examples/backends/vllm
command: command:
- /bin/sh - /bin/sh
......
...@@ -17,25 +17,25 @@ spec: ...@@ -17,25 +17,25 @@ spec:
app: vllm-runtime-image-cache app: vllm-runtime-image-cache
spec: spec:
imagePullSecrets: imagePullSecrets:
- name: nvcr-imagepullsecret - name: nvcr-imagepullsecret
containers: containers:
- name: image-cache - name: image-cache
image: my-registry/vllm-runtime:my-tag image: my-registry/vllm-runtime:my-tag
command: command:
- /bin/sh - /bin/sh
- -c - -c
- "sleep infinity" - "sleep infinity"
resources: resources:
requests: requests:
cpu: "10m" cpu: "10m"
memory: "64Mi" memory: "64Mi"
limits: limits:
cpu: "100m" cpu: "100m"
memory: "128Mi" memory: "128Mi"
tolerations: tolerations:
- key: node-role.kubernetes.io/master - key: node-role.kubernetes.io/master
operator: Exists operator: Exists
effect: NoSchedule effect: NoSchedule
- key: node-role.kubernetes.io/control-plane - key: node-role.kubernetes.io/control-plane
operator: Exists operator: Exists
effect: NoSchedule effect: NoSchedule
...@@ -12,7 +12,7 @@ spec: ...@@ -12,7 +12,7 @@ spec:
replicas: 1 replicas: 1
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag image: nvcr.io/nvidia/ai-dynamo/dynamo-frontend:my-tag
workingDir: /workspace/examples/backends/vllm workingDir: /workspace/examples/backends/vllm
command: command:
- python3 - python3
...@@ -26,11 +26,11 @@ spec: ...@@ -26,11 +26,11 @@ spec:
replicas: 1 replicas: 1
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag image: nvcr.io/nvidia/ai-dynamo/dynamo-planner:my-tag
command: command:
- python3 - python3
- -m - -m
- dynamo.planner - dynamo.planner
args: args:
- --config - --config
- '{"environment": "kubernetes", "backend": "vllm", "enable_load_scaling": true, "enable_throughput_scaling": false, "pre_deployment_sweeping_mode": "none", "load_adjustment_interval": 5, "load_min_observations": 5}' - '{"environment": "kubernetes", "backend": "vllm", "enable_load_scaling": true, "enable_throughput_scaling": false, "pre_deployment_sweeping_mode": "none", "load_adjustment_interval": 5, "load_min_observations": 5}'
......
...@@ -12,20 +12,20 @@ spec: ...@@ -12,20 +12,20 @@ spec:
replicas: 1 replicas: 1
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag image: nvcr.io/nvidia/ai-dynamo/dynamo-frontend:my-tag
Planner: Planner:
componentType: planner componentType: planner
replicas: 1 replicas: 1
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag image: nvcr.io/nvidia/ai-dynamo/dynamo-planner:my-tag
command: command:
- python3 - python3
- -m - -m
- dynamo.planner - dynamo.planner
args: args:
- --config - --config
- '{"environment": "kubernetes", "backend": "vllm", "throughput_adjustment_interval": 60, "profile_results_dir": "/workspace/tests/planner/profiling_results/H200_TP1P_TP1D", "no_correction": true}' - '{"environment": "kubernetes", "backend": "vllm", "throughput_adjustment_interval": 60, "profile_results_dir": "/workspace/components/src/dynamo/planner/tests/data/profiling_results/H200_TP1P_TP1D", "no_correction": true}'
VllmDecodeWorker: VllmDecodeWorker:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
......
#!/bin/bash #!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# Run SLA planner scaling end-to-end test
# This script:
# 1. Deploys the disaggregated planner if not already running
# 2. Sets up port forwarding to localhost:8000
# 3. Waits for the deployment to be ready
# 4. Runs the scaling test (8 req/s -> 18 req/s)
# 5. Cleans up
#
# Supports two modes:
# --mode throughput (default) Uses throughput-based planner
# --mode load Uses load-based planner with regression scaling
set -e set -e
# Configuration
NAMESPACE=${NAMESPACE:-default} NAMESPACE=${NAMESPACE:-default}
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_FILE="$SCRIPT_DIR/../test_scaling_e2e.py" TEST_FILE="$SCRIPT_DIR/scaling_e2e.py"
FRONTEND_PORT=8000 FRONTEND_PORT=8000
LOCAL_PORT=8000 LOCAL_PORT=""
DEPLOYMENT_NAME="vllm-disagg-planner" DEPLOYMENT_NAME="vllm-disagg-planner"
SAVE_RESULTS=false SAVE_RESULTS=false
MODE="throughput" MODE="throughput"
DEPLOYED_BY_US=false
# Colors for output
RED='\033[0;31m' RED='\033[0;31m'
GREEN='\033[0;32m' GREEN='\033[0;32m'
YELLOW='\033[1;33m' YELLOW='\033[1;33m'
BLUE='\033[0;34m' BLUE='\033[0;34m'
NC='\033[0m' # No Color NC='\033[0m'
log_info() { log_info() {
echo -e "${BLUE}[INFO]${NC} $1" echo -e "${BLUE}[INFO]${NC} $1"
...@@ -49,7 +37,21 @@ log_error() { ...@@ -49,7 +37,21 @@ log_error() {
echo -e "${RED}[ERROR]${NC} $1" echo -e "${RED}[ERROR]${NC} $1"
} }
# Check prerequisites find_free_local_port() {
local python_cmd="python3"
if ! command -v python3 &> /dev/null; then
python_cmd="python"
fi
"$python_cmd" - <<'PY'
import socket
with socket.socket() as sock:
sock.bind(("127.0.0.1", 0))
print(sock.getsockname()[1])
PY
}
check_prerequisites() { check_prerequisites() {
log_info "Checking prerequisites..." log_info "Checking prerequisites..."
...@@ -68,48 +70,40 @@ check_prerequisites() { ...@@ -68,48 +70,40 @@ check_prerequisites() {
exit 1 exit 1
fi fi
# Check for aiperf
if ! command -v aiperf &> /dev/null; then if ! command -v aiperf &> /dev/null; then
log_error "aiperf not found. This tool is required for load generation." log_error "aiperf not found. This tool is required for load generation."
log_error "Please install the required dependencies by following the instructions in tests/planner/README.md" log_error "Follow components/src/dynamo/planner/tests/manual/README.md for setup."
exit 1 exit 1
fi fi
log_success "Prerequisites check passed" log_success "Prerequisites check passed"
} }
# Check if deployment already exists and is running
check_existing_deployment() { check_existing_deployment() {
log_info "Checking for existing deployment..." log_info "Checking for existing deployment..."
# Check for the DynamoGraphDeployment custom resource
if kubectl get dynamographdeployment "$DEPLOYMENT_NAME" -n "$NAMESPACE" &> /dev/null; then if kubectl get dynamographdeployment "$DEPLOYMENT_NAME" -n "$NAMESPACE" &> /dev/null; then
log_info "DynamoGraphDeployment $DEPLOYMENT_NAME already exists - skipping redeployment" log_info "DynamoGraphDeployment $DEPLOYMENT_NAME already exists - skipping redeployment"
# Check if the DynamoGraphDeployment is ready
local status local status
status=$(kubectl get dynamographdeployment "$DEPLOYMENT_NAME" -n "$NAMESPACE" -o jsonpath='{.status.state}') status=$(kubectl get dynamographdeployment "$DEPLOYMENT_NAME" -n "$NAMESPACE" -o jsonpath='{.status.state}')
if [ "$status" = "successful" ]; then if [ "$status" = "successful" ]; then
# Check if frontend pod is running
# Note: operator automatically prefixes k8s namespace to dynamo-namespace
if kubectl get pods -n "$NAMESPACE" -l "nvidia.com/dynamo-component-type=frontend,nvidia.com/dynamo-namespace=${NAMESPACE}-vllm-disagg-planner" --field-selector=status.phase=Running | grep -q .; then if kubectl get pods -n "$NAMESPACE" -l "nvidia.com/dynamo-component-type=frontend,nvidia.com/dynamo-namespace=${NAMESPACE}-vllm-disagg-planner" --field-selector=status.phase=Running | grep -q .; then
log_success "Existing deployment is ready" log_success "Existing deployment is ready"
return 0 return 0
else
log_warning "Existing deployment pods are not ready, will redeploy"
return 1
fi fi
else log_warning "Existing deployment pods are not ready, will redeploy"
log_warning "Existing deployment is not ready (status: $status), will redeploy"
return 1 return 1
fi fi
else
log_info "No existing deployment found" log_warning "Existing deployment is not ready (status: $status), will redeploy"
return 1 return 1
fi fi
log_info "No existing deployment found"
return 1
} }
# Deploy the planner
deploy_planner() { deploy_planner() {
log_info "Deploying SLA planner..." log_info "Deploying SLA planner..."
...@@ -118,55 +112,28 @@ deploy_planner() { ...@@ -118,55 +112,28 @@ deploy_planner() {
exit 1 exit 1
fi fi
# Apply the deployment kubectl apply -f "$YAML_FILE" -n "$NAMESPACE"
if kubectl apply -f "$YAML_FILE" -n "$NAMESPACE"; then log_success "Deployment applied successfully"
log_success "Deployment applied successfully"
else
log_error "Failed to apply deployment"
exit 1
fi
log_info "Waiting for DynamoGraphDeployment to be processed..." log_info "Waiting for DynamoGraphDeployment to be processed..."
if kubectl wait --for=condition=Ready dynamographdeployment/"$DEPLOYMENT_NAME" -n "$NAMESPACE" --timeout=600s; then kubectl wait --for=condition=Ready dynamographdeployment/"$DEPLOYMENT_NAME" -n "$NAMESPACE" --timeout=600s
log_success "DynamoGraphDeployment is ready" log_success "DynamoGraphDeployment is ready"
else
log_error "DynamoGraphDeployment failed to become ready within timeout"
exit 1
fi
log_info "Waiting for pods to be running (this may take several minutes for image pulls)..."
log_info "Waiting for frontend pod..." log_info "Waiting for frontend pod..."
# Note: operator automatically prefixes k8s namespace to dynamo-namespace kubectl wait --for=condition=Ready pod -l "nvidia.com/dynamo-component-type=frontend,nvidia.com/dynamo-namespace=${NAMESPACE}-vllm-disagg-planner" -n "$NAMESPACE" --timeout=900s
if kubectl wait --for=condition=Ready pod -l "nvidia.com/dynamo-component-type=frontend,nvidia.com/dynamo-namespace=${NAMESPACE}-vllm-disagg-planner" -n "$NAMESPACE" --timeout=900s; then log_success "Frontend pod is ready"
log_success "Frontend pod is ready"
else
log_error "Frontend pod failed to become ready within timeout"
exit 1
fi
log_info "Waiting for all pods to be running..." log_info "Waiting for planner pod..."
kubectl wait --for=condition=Ready pod -l "nvidia.com/dynamo-component-type=planner,nvidia.com/dynamo-namespace=${NAMESPACE}-vllm-disagg-planner" -n "$NAMESPACE" --timeout=900s
sleep 30 sleep 30
} }
setup_port_forward() { setup_port_forward() {
log_info "Setting up port forwarding..." log_info "Setting up port forwarding..."
LOCAL_PORT=$(find_free_local_port)
# Kill any existing port forward on the same port log_info "Using local port $LOCAL_PORT for frontend port-forward"
if lsof -ti:$LOCAL_PORT &> /dev/null; then
log_warning "Port $LOCAL_PORT is already in use, attempting to free it..."
kill "$(lsof -ti:$LOCAL_PORT)" 2>/dev/null || true
sleep 2
fi
local frontend_service="vllm-disagg-planner-frontend" local frontend_service="vllm-disagg-planner-frontend"
if ! kubectl get service "$frontend_service" -n "$NAMESPACE" &> /dev/null; then
log_error "Frontend service '$frontend_service' not found"
return 1
fi
log_info "Port forwarding to service: $frontend_service"
kubectl port-forward service/"$frontend_service" "$LOCAL_PORT:$FRONTEND_PORT" -n "$NAMESPACE" >/dev/null 2>&1 & kubectl port-forward service/"$frontend_service" "$LOCAL_PORT:$FRONTEND_PORT" -n "$NAMESPACE" >/dev/null 2>&1 &
PORT_FORWARD_PID=$! PORT_FORWARD_PID=$!
...@@ -194,11 +161,14 @@ cleanup_port_forward() { ...@@ -194,11 +161,14 @@ cleanup_port_forward() {
cleanup_deployment() { cleanup_deployment() {
log_info "Cleaning up deployment..." log_info "Cleaning up deployment..."
kubectl delete -f "$YAML_FILE" -n "$NAMESPACE" --ignore-not-found kubectl delete -f "$YAML_FILE" -n "$NAMESPACE" --ignore-not-found
log_info "Waiting for cleanup to complete..."
kubectl wait --for=delete dynamographdeployment/"$DEPLOYMENT_NAME" -n "$NAMESPACE" --timeout=120s || true kubectl wait --for=delete dynamographdeployment/"$DEPLOYMENT_NAME" -n "$NAMESPACE" --timeout=120s || true
}
log_info "Cleanup complete" cleanup() {
cleanup_port_forward
if [ "$DEPLOYED_BY_US" = true ]; then
cleanup_deployment
fi
} }
run_test() { run_test() {
...@@ -209,19 +179,13 @@ run_test() { ...@@ -209,19 +179,13 @@ run_test() {
python_cmd="python" python_cmd="python"
fi fi
local test_args="--namespace $NAMESPACE --mode $MODE" local test_args="--namespace $NAMESPACE --mode $MODE --base-url http://localhost:$LOCAL_PORT"
if [ "$SAVE_RESULTS" = true ]; then if [ "$SAVE_RESULTS" = true ]; then
test_args="$test_args --save-results" test_args="$test_args --save-results"
log_info "Results will be saved to tests/planner/e2e_scaling_results" log_info "Results will be saved to components/src/dynamo/planner/tests/e2e_scaling_results"
fi fi
if $python_cmd "$TEST_FILE" $test_args; then $python_cmd "$TEST_FILE" $test_args
log_success "Scaling test PASSED"
return 0
else
log_error "Scaling test FAILED"
return 1
fi
} }
main() { main() {
...@@ -245,70 +209,31 @@ main() { ...@@ -245,70 +209,31 @@ main() {
;; ;;
--help) --help)
echo "Usage: $0 [--namespace NS] [--mode MODE] [--save-results]" echo "Usage: $0 [--namespace NS] [--mode MODE] [--save-results]"
echo ""
echo "Run SLA planner scaling test (graduated 8->18 req/s prefill scaling)"
echo ""
echo "Options:"
echo " --namespace NS Kubernetes namespace (default: default)"
echo " --mode MODE Scaling mode: 'throughput' (default) or 'load'"
echo " --save-results Save results to tests/planner/e2e_scaling_results instead of /tmp"
echo " --help Show this help"
exit 0 exit 0
;; ;;
*) *)
log_error "Unknown option: $1" log_error "Unknown option: $1"
echo "Use --help for usage information"
exit 1 exit 1
;; ;;
esac esac
done done
# Select YAML based on mode
if [ "$MODE" = "load" ]; then if [ "$MODE" = "load" ]; then
YAML_FILE="$SCRIPT_DIR/disagg_planner_load.yaml" YAML_FILE="$SCRIPT_DIR/disagg_planner_load.yaml"
else else
YAML_FILE="$SCRIPT_DIR/disagg_planner_throughput.yaml" YAML_FILE="$SCRIPT_DIR/disagg_planner_throughput.yaml"
fi fi
log_info "SLA Planner Scaling Test"
log_info "Namespace: $NAMESPACE"
log_info "Mode: $MODE"
log_info "YAML: $YAML_FILE"
log_info "Scenario: Graduated 8->18 req/s (1P1D -> 2P1D prefill scaling, ISL=4000/OSL=150)"
check_prerequisites check_prerequisites
trap cleanup EXIT
trap cleanup_port_forward EXIT
# Check if we need to deploy
local deployed_by_us=false
if ! check_existing_deployment; then if ! check_existing_deployment; then
deploy_planner deploy_planner
deployed_by_us=true DEPLOYED_BY_US=true
fi
if ! setup_port_forward; then
log_error "Failed to setup port forwarding"
exit 1
fi
local test_result=0
if ! run_test; then
test_result=1
fi
# Only cleanup deployment if we deployed it
if [ "$deployed_by_us" = true ]; then
cleanup_deployment
fi
if [ $test_result -eq 0 ]; then
log_success "Test completed successfully!"
else
log_error "Test failed!"
fi fi
exit $test_result setup_port_forward
run_test
} }
main "$@" main "$@"
\ No newline at end of file
...@@ -2,31 +2,28 @@ ...@@ -2,31 +2,28 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
""" """
End-to-end test for SLA planner scaling behavior. Manual end-to-end scaling check for the SLA planner.
This test assumes a disaggregated planner deployment is already running This script intentionally lives outside the automated test tree so it can be kept in the
and accessible at localhost:8000. It monitors pod scaling and validates planner image without being collected by pytest.
that the planner correctly scales from 1P1D to 2P1D when load increases
through graduated phases: 8 req/s (baseline) → 15 req/s (moderate) → 25 req/s (prefill scaling trigger).
""" """
import asyncio import asyncio
import json import json
import logging import logging
import subprocess import subprocess
import sys
import time import time
import urllib.request
from dataclasses import dataclass from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
from utils.load_generator import LoadGenerator from dynamo.planner.tests.unit.load_generator import LoadGenerator
logging.basicConfig( logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
) )
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Test configuration constants
HEALTH_CHECK_TIMEOUT = 10 HEALTH_CHECK_TIMEOUT = 10
PORT_FORWARD_SETUP_DELAY = 3 PORT_FORWARD_SETUP_DELAY = 3
FINAL_STABILIZATION_DELAY = 60 FINAL_STABILIZATION_DELAY = 60
...@@ -58,19 +55,17 @@ class KubernetesMonitor: ...@@ -58,19 +55,17 @@ class KubernetesMonitor:
self.pod_history: List[PodCounts] = [] self.pod_history: List[PodCounts] = []
def _run_kubectl(self, cmd: List[str]) -> Tuple[bool, str]: def _run_kubectl(self, cmd: List[str]) -> Tuple[bool, str]:
"""Run kubectl command and return success status and output."""
try: try:
result = subprocess.run(cmd, capture_output=True, text=True, timeout=30) result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
return result.returncode == 0, result.stdout.strip() return result.returncode == 0, result.stdout.strip()
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
logger.error(f"kubectl command timed out: {' '.join(cmd)}") logger.error("kubectl command timed out: %s", " ".join(cmd))
return False, "" return False, ""
except Exception as e: except OSError as exc:
logger.error(f"kubectl command failed: {e}") logger.error("kubectl command failed: %s", exc)
return False, "" return False, ""
def get_pod_counts(self) -> Optional[PodCounts]: def get_pod_counts(self) -> Optional[PodCounts]:
"""Get current pod counts for prefill and decode workers."""
cmd = [ cmd = [
"kubectl", "kubectl",
"get", "get",
...@@ -101,7 +96,6 @@ class KubernetesMonitor: ...@@ -101,7 +96,6 @@ class KubernetesMonitor:
"nvidia.com/dynamo-sub-component-type", "" "nvidia.com/dynamo-sub-component-type", ""
) )
# Only count Running pods
if pod_phase == "Running": if pod_phase == "Running":
if sub_component == "prefill": if sub_component == "prefill":
prefill_pods += 1 prefill_pods += 1
...@@ -117,19 +111,18 @@ class KubernetesMonitor: ...@@ -117,19 +111,18 @@ class KubernetesMonitor:
decode_pods=decode_pods, decode_pods=decode_pods,
total_pods=total_pods, total_pods=total_pods,
) )
self.pod_history.append(counts) self.pod_history.append(counts)
return counts return counts
except json.JSONDecodeError as exc:
except Exception as e: logger.error("Failed to parse pod counts: %s", exc)
logger.error(f"Failed to parse pod counts: {e}")
return None return None
async def monitor_scaling( async def monitor_scaling(
self, duration: int, interval: int = 10 self, duration: int, interval: int = 10
) -> List[PodCounts]: ) -> List[PodCounts]:
"""Monitor pod scaling for a given duration.""" logger.info(
logger.info(f"Monitoring pod scaling for {duration}s (interval: {interval}s)") "Monitoring pod scaling for %ss (interval: %ss)", duration, interval
)
start_time = time.time() start_time = time.time()
monitoring_data = [] monitoring_data = []
...@@ -138,123 +131,14 @@ class KubernetesMonitor: ...@@ -138,123 +131,14 @@ class KubernetesMonitor:
counts = self.get_pod_counts() counts = self.get_pod_counts()
if counts: if counts:
monitoring_data.append(counts) monitoring_data.append(counts)
logger.info(f"Pod counts: {counts}") logger.info("Pod counts: %s", counts)
await asyncio.sleep(interval) await asyncio.sleep(interval)
return monitoring_data return monitoring_data
def wait_for_deployment_ready(self, timeout: int = 300) -> bool:
"""Wait for deployment to be ready."""
logger.info(f"Waiting for deployment {self.deployment_name} to be ready...")
cmd = [
"kubectl",
"wait",
"--for=condition=available",
f"deployment/{self.deployment_name}",
"-n",
self.namespace,
f"--timeout={timeout}s",
]
success, output = self._run_kubectl(cmd)
if success:
logger.info("Deployment is ready")
return True
else:
logger.error(f"Deployment failed to become ready: {output}")
return False
def apply_deployment(self, yaml_file: str) -> bool:
"""Apply Kubernetes deployment from YAML file."""
logger.info(f"Applying deployment from {yaml_file}")
cmd = ["kubectl", "apply", "-f", yaml_file, "-n", self.namespace]
success, output = self._run_kubectl(cmd)
if success:
logger.info("Deployment applied successfully")
return True
else:
logger.error(f"Failed to apply deployment: {output}")
return False
def delete_deployment(self, yaml_file: str) -> bool:
"""Delete Kubernetes deployment."""
logger.info(f"Deleting deployment from {yaml_file}")
cmd = [
"kubectl",
"delete",
"-f",
yaml_file,
"-n",
self.namespace,
"--ignore-not-found",
]
success, output = self._run_kubectl(cmd)
if success:
logger.info("Deployment deleted successfully")
else:
logger.warning(f"Failed to delete deployment: {output}")
return success
def check_service_health(
self, service_name: str | None = None, port: int = 8000
) -> bool:
"""Check if the frontend service is healthy."""
if service_name is None:
service_name = f"{self.deployment_name}-frontend"
# Port forward to check health
cmd = [
"kubectl",
"port-forward",
f"service/{service_name}",
f"{port}:{port}",
"-n",
self.namespace,
]
proc = None
try:
# Start port forwarding in background
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
# Give it a moment to establish connection
time.sleep(PORT_FORWARD_SETUP_DELAY)
# Try to check health endpoint
try:
response = urllib.request.urlopen(
f"http://localhost:{port}/health", timeout=HEALTH_CHECK_TIMEOUT
)
healthy = response.status == 200
logger.info(f"Service health check: {'OK' if healthy else 'FAILED'}")
except Exception as e:
logger.warning(f"Health check failed: {e}")
healthy = False
return healthy
except Exception as e:
logger.error(f"Failed to check service health: {e}")
return False
finally:
# Ensure port forwarding is terminated
if proc is not None:
proc.terminate()
try:
proc.wait(timeout=5)
except subprocess.TimeoutExpired:
proc.kill()
class ScalingE2ETest: class ScalingE2ETest:
"""End-to-end test for SLA planner scaling behavior.""" """Manual end-to-end scaling validation for the SLA planner."""
def __init__( def __init__(
self, self,
...@@ -267,88 +151,56 @@ class ScalingE2ETest: ...@@ -267,88 +151,56 @@ class ScalingE2ETest:
self.base_url = base_url self.base_url = base_url
self.save_results = save_results self.save_results = save_results
self.mode = mode self.mode = mode
self.k8s_monitor = KubernetesMonitor(namespace) self.k8s_monitor = KubernetesMonitor(namespace)
self.load_generator = LoadGenerator( self.load_generator = LoadGenerator(
base_url=base_url, save_results=save_results base_url=base_url, save_results=save_results
) )
self.test_results: Dict[str, Any] = {} async def run_scaling_test(self) -> Dict[str, Any]:
logger.info("Starting manual scaling integration test (mode=%s)", self.mode)
async def run_scaling_test(self) -> Dict:
"""
Run the complete scaling test.
Hardcoded scenario:
- Phase 1 (8 req/s): Should maintain 1P1D
- Phase 2 (18 req/s): Should scale to 2P1D
"""
logger.info(f"Starting scaling integration test (mode={self.mode})")
test_start_time = time.time() test_start_time = time.time()
# Record initial state
initial_counts = self.k8s_monitor.get_pod_counts() initial_counts = self.k8s_monitor.get_pod_counts()
logger.info(f"Test starting with: {initial_counts}") logger.info("Test starting with: %s", initial_counts)
# Start background monitoring total_test_duration = (
# Calculate based on actual phases from load generator 120 + 30 + 120 + BUFFER_DURATION
if self.mode == "load": if self.mode == "load"
# Load-based: baseline(120s) + transition(30s) + trigger(120s) + buffer else 90 + 30 + 120 + BUFFER_DURATION
total_test_duration = 120 + 30 + 120 + BUFFER_DURATION )
else:
# Throughput: baseline(90s) + transition(30s) + trigger(120s) + buffer
total_test_duration = 90 + 30 + 120 + BUFFER_DURATION
monitoring_task = asyncio.create_task( monitoring_task = asyncio.create_task(
self.k8s_monitor.monitor_scaling( self.k8s_monitor.monitor_scaling(
total_test_duration, interval=MONITORING_INTERVAL total_test_duration, interval=MONITORING_INTERVAL
) )
) )
# Initialize results in case of exception baseline_results: Dict[str, Any] = {}
baseline_results = {} trigger_results: Dict[str, Any] = {}
trigger_results = {}
try: try:
# Use the load generator's built-in scaling test
logger.info(
f"Running scaling scenario (8 req/s -> 18 req/s, mode={self.mode})"
)
load_results = await self.load_generator.run_scaling_test(mode=self.mode) load_results = await self.load_generator.run_scaling_test(mode=self.mode)
# Extract load results for analysis (2-phase structure)
phase_results = load_results.get("phase_results", {}) phase_results = load_results.get("phase_results", {})
baseline_results = phase_results.get("phase1_baseline", {}) baseline_results = phase_results.get("phase1_baseline", {})
trigger_results = phase_results.get("phase2_prefill_scaling_trigger", {}) trigger_results = phase_results.get("phase2_prefill_scaling_trigger", {})
# Check final pod counts
final_counts = self.k8s_monitor.get_pod_counts() final_counts = self.k8s_monitor.get_pod_counts()
logger.info(f"Final pod counts: {final_counts}") logger.info("Final pod counts: %s", final_counts)
# Wait a bit more to capture any delayed scaling
logger.info("Waiting for potential delayed scaling...") logger.info("Waiting for potential delayed scaling...")
await asyncio.sleep(FINAL_STABILIZATION_DELAY) await asyncio.sleep(FINAL_STABILIZATION_DELAY)
# Get final final counts
final_final_counts = self.k8s_monitor.get_pod_counts() final_final_counts = self.k8s_monitor.get_pod_counts()
logger.info(f"Final final pod counts: {final_final_counts}") logger.info("Final final pod counts: %s", final_final_counts)
except Exception as e:
logger.error(f"Test execution failed: {e}")
raise
finally: finally:
# Stop monitoring
monitoring_task.cancel() monitoring_task.cancel()
try: try:
await monitoring_task await monitoring_task
except asyncio.CancelledError: except asyncio.CancelledError:
pass pass
# Compile results return {
test_results: Dict[str, Any] = {
"test_duration": time.time() - test_start_time, "test_duration": time.time() - test_start_time,
"config": { "config": {
# Document actual test configuration
"baseline_rps": 8.0, "baseline_rps": 8.0,
"trigger_rps": 18.0, "trigger_rps": 18.0,
"phase_durations": {"baseline": 90, "trigger": 120}, "phase_durations": {"baseline": 90, "trigger": 120},
...@@ -365,16 +217,11 @@ class ScalingE2ETest: ...@@ -365,16 +217,11 @@ class ScalingE2ETest:
"scaling_analysis": self.analyze_scaling_behavior(), "scaling_analysis": self.analyze_scaling_behavior(),
} }
return test_results def analyze_scaling_behavior(self) -> Dict[str, Any]:
def analyze_scaling_behavior(self) -> Dict:
"""Analyze the scaling behavior from pod history."""
if len(self.k8s_monitor.pod_history) < 2: if len(self.k8s_monitor.pod_history) < 2:
return {"error": "Insufficient data for analysis"} return {"error": "Insufficient data for analysis"}
history = self.k8s_monitor.pod_history history = self.k8s_monitor.pod_history
# Find scaling events
scaling_events = [] scaling_events = []
for i in range(1, len(history)): for i in range(1, len(history)):
prev = history[i - 1] prev = history[i - 1]
...@@ -396,10 +243,8 @@ class ScalingE2ETest: ...@@ -396,10 +243,8 @@ class ScalingE2ETest:
} }
) )
# Check if expected scaling occurred
initial = history[0] initial = history[0]
final = history[-1] final = history[-1]
expected_scaling = { expected_scaling = {
"initial_1p1d": initial.prefill_pods == 1 and initial.decode_pods == 1, "initial_1p1d": initial.prefill_pods == 1 and initial.decode_pods == 1,
"final_2p1d": final.prefill_pods == 2 and final.decode_pods == 1, "final_2p1d": final.prefill_pods == 2 and final.decode_pods == 1,
...@@ -420,57 +265,41 @@ class ScalingE2ETest: ...@@ -420,57 +265,41 @@ class ScalingE2ETest:
"total_scaling_events": len(scaling_events), "total_scaling_events": len(scaling_events),
} }
def validate_test_results(self, results: Dict) -> Dict: def validate_test_results(self, results: Dict[str, Any]) -> Dict[str, Any]:
"""Validate that the test achieved expected scaling behavior."""
validation: Dict[str, Any] = {"test_passed": False, "issues": [], "summary": ""} validation: Dict[str, Any] = {"test_passed": False, "issues": [], "summary": ""}
analysis = results.get("scaling_analysis")
# Check if we have the expected data if not analysis:
if not results.get("scaling_analysis"):
validation["issues"].append("No scaling analysis data") validation["issues"].append("No scaling analysis data")
return validation return validation
analysis = results["scaling_analysis"]
expected = analysis.get("expected_scaling", {}) expected = analysis.get("expected_scaling", {})
# Validate initial state
if not expected.get("initial_1p1d"): if not expected.get("initial_1p1d"):
validation["issues"].append("Test did not start with 1P1D configuration") validation["issues"].append("Test did not start with 1P1D configuration")
# Validate final state
if not expected.get("final_2p1d"): if not expected.get("final_2p1d"):
validation["issues"].append( validation["issues"].append(
"Test did not end with expected 2P1D configuration" "Test did not end with expected 2P1D configuration"
) )
# Validate scaling occurred
if not expected.get("scaling_occurred"): if not expected.get("scaling_occurred"):
validation["issues"].append("No scaling events detected") validation["issues"].append("No scaling events detected")
# Check if correct scaling occurred
if expected.get("correct_scaling"): if expected.get("correct_scaling"):
validation["test_passed"] = True validation["test_passed"] = True
validation[ validation["summary"] = "PASS: Successfully scaled from 1P1D to 2P1D"
"summary"
] = "✅ Test PASSED: Successfully scaled from 1P1D to 2P1D"
else: else:
validation[ validation[
"summary" "summary"
] = "❌ Test FAILED: Did not achieve expected 1P1D -> 2P1D scaling" ] = "FAIL: Did not achieve expected 1P1D -> 2P1D scaling"
# Add performance validation across all phases
baseline = results.get("baseline_results", {}) baseline = results.get("baseline_results", {})
trigger = results.get("trigger_results", {}) trigger = results.get("trigger_results", {})
if baseline.get("throughput", 0) > 0: if baseline.get("throughput", 0) > 0:
validation["baseline_throughput"] = f"{baseline['throughput']:.2f} req/s" validation["baseline_throughput"] = f"{baseline['throughput']:.2f} req/s"
if trigger.get("throughput", 0) > 0: if trigger.get("throughput", 0) > 0:
validation["trigger_throughput"] = f"{trigger['throughput']:.2f} req/s" validation["trigger_throughput"] = f"{trigger['throughput']:.2f} req/s"
return validation return validation
async def main(): async def main():
"""Main function for running the e2e test."""
import argparse import argparse
parser = argparse.ArgumentParser(description="SLA Planner Scaling E2E Test") parser = argparse.ArgumentParser(description="SLA Planner Scaling E2E Test")
...@@ -481,7 +310,10 @@ async def main(): ...@@ -481,7 +310,10 @@ async def main():
parser.add_argument( parser.add_argument(
"--save-results", "--save-results",
action="store_true", action="store_true",
help="Save results to tests/planner/e2e_scaling_results instead of /tmp", help=(
"Save results to components/src/dynamo/planner/tests/e2e_scaling_results "
"instead of /tmp"
),
) )
parser.add_argument( parser.add_argument(
"--mode", "--mode",
...@@ -491,7 +323,6 @@ async def main(): ...@@ -491,7 +323,6 @@ async def main():
) )
args = parser.parse_args() args = parser.parse_args()
test = ScalingE2ETest( test = ScalingE2ETest(
namespace=args.namespace, namespace=args.namespace,
base_url=args.base_url, base_url=args.base_url,
...@@ -500,57 +331,28 @@ async def main(): ...@@ -500,57 +331,28 @@ async def main():
) )
try: try:
# Check that service is accessible
logger.info(f"Checking service availability at {args.base_url}...")
# Run the scaling test
logger.info("Running scaling test...") logger.info("Running scaling test...")
results = await test.run_scaling_test() results = await test.run_scaling_test()
# Validate results
validation = test.validate_test_results(results) validation = test.validate_test_results(results)
# Save results
timestamp = int(time.time()) timestamp = int(time.time())
results_file = f"/tmp/scaling_test_results_{timestamp}.json" results_file = f"/tmp/scaling_test_results_{timestamp}.json"
with open(results_file, "w") as f: with open(results_file, "w") as handle:
json.dump({"results": results, "validation": validation}, f, indent=2) json.dump({"results": results, "validation": validation}, handle, indent=2)
# Print summary
logger.info("=" * 60) logger.info("=" * 60)
logger.info("TEST SUMMARY") logger.info("TEST SUMMARY")
logger.info("=" * 60) logger.info("=" * 60)
logger.info(validation["summary"]) logger.info(validation["summary"])
for issue in validation["issues"]:
if validation["issues"]: logger.info("Issue: %s", issue)
logger.info("\nIssues found:") logger.info("Detailed results saved to: %s", results_file)
for issue in validation["issues"]:
logger.info(f" - {issue}")
if any(k.endswith("_throughput") for k in validation.keys()):
logger.info("\nPerformance:")
if "baseline_throughput" in validation:
logger.info(
f" Baseline (8 req/s): {validation['baseline_throughput']}"
)
if "moderate_throughput" in validation:
logger.info(
f" Moderate (15 req/s): {validation['moderate_throughput']}"
)
if "trigger_throughput" in validation:
logger.info(f" Trigger (25 req/s): {validation['trigger_throughput']}")
logger.info(f"\nDetailed results saved to: {results_file}")
logger.info("=" * 60) logger.info("=" * 60)
return 0 if validation["test_passed"] else 1 return 0 if validation["test_passed"] else 1
except Exception:
except Exception as e: logger.exception("Test failed unexpectedly")
logger.error(f"Test failed with error: {e}") raise
return 1
if __name__ == "__main__": if __name__ == "__main__":
import sys
sys.exit(asyncio.run(main())) sys.exit(asyncio.run(main()))
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# This is a simple manual load-test helper for planner validation.
# To validate:
# 1. Run a 1P1D disaggregated deployment.
# 2. Start planner with the desired config.
# 3. Run ./load_test.sh <num_requests>.
# Expected behavior is scale up and then back down after the burst.
if [ $# -ne 1 ]; then
echo "Usage: $0 <number_of_executions>"
exit 1
fi
executions=$1
echo "Starting $executions non-blocking executions..."
for (( i=1; i<=$executions; i++ )); do
curl localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"messages": [
{
"role": "user",
"content": "Generate a long response to produce sustained planner load."
}
],
"stream": true,
"max_tokens": 500
}' > /dev/null 2>&1 &
done
echo "All $executions executions have been launched!"
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse import argparse
import logging import logging
......
...@@ -353,7 +353,7 @@ async def main(): ...@@ -353,7 +353,7 @@ async def main():
parser.add_argument( parser.add_argument(
"--save-results", "--save-results",
action="store_true", action="store_true",
help="Save results to tests/planner/e2e_scaling_results instead of /tmp", help="Save results to components/src/dynamo/planner/tests/data instead of /tmp",
) )
args = parser.parse_args() args = parser.parse_args()
......
...@@ -22,6 +22,13 @@ from kubernetes import client ...@@ -22,6 +22,13 @@ from kubernetes import client
from dynamo.planner.connectors.kubernetes_api import KubernetesAPI from dynamo.planner.connectors.kubernetes_api import KubernetesAPI
from dynamo.planner.errors import DynamoGraphDeploymentNotFoundError from dynamo.planner.errors import DynamoGraphDeploymentNotFoundError
pytestmark = [
pytest.mark.gpu_0,
pytest.mark.pre_merge,
pytest.mark.unit,
pytest.mark.planner,
]
@pytest.fixture @pytest.fixture
def mock_config(): def mock_config():
...@@ -322,28 +329,26 @@ async def test_wait_for_graph_deployment_ready_on_second_attempt( ...@@ -322,28 +329,26 @@ async def test_wait_for_graph_deployment_ready_on_second_attempt(
) )
@pytest.mark.asyncio def test_get_graph_deployment(k8s_api, mock_custom_api):
async def test_get_graph_deployment(k8s_api, mock_custom_api):
"""Test get_graph_deployment""" """Test get_graph_deployment"""
mock_deployment = {"metadata": {"name": "parent-dgd"}} mock_deployment = {"metadata": {"name": "parent-dgd"}}
with patch.object( with patch.object(
k8s_api, "_get_graph_deployment_from_name", return_value=mock_deployment k8s_api, "_get_graph_deployment_from_name", return_value=mock_deployment
) as mock_get: ) as mock_get:
result = await k8s_api.get_graph_deployment("parent-dgd") result = k8s_api.get_graph_deployment("parent-dgd")
assert result == mock_deployment assert result == mock_deployment
mock_get.assert_called_once_with("parent-dgd") mock_get.assert_called_once_with("parent-dgd")
@pytest.mark.asyncio def test_get_graph_deployment_not_found(k8s_api, mock_custom_api):
async def test_get_graph_deployment_not_found(k8s_api, mock_custom_api):
"""Test get_graph_deployment when deployment is not found""" """Test get_graph_deployment when deployment is not found"""
k8s_api.custom_api.get_namespaced_custom_object.side_effect = client.ApiException( k8s_api.custom_api.get_namespaced_custom_object.side_effect = client.ApiException(
status=404 status=404
) )
with pytest.raises(DynamoGraphDeploymentNotFoundError) as exc_info: with pytest.raises(DynamoGraphDeploymentNotFoundError) as exc_info:
await k8s_api.get_graph_deployment("parent-dgd") k8s_api.get_graph_deployment("parent-dgd")
exception = exc_info.value exception = exc_info.value
assert exception.deployment_name == "parent-dgd" assert exception.deployment_name == "parent-dgd"
......
...@@ -34,6 +34,13 @@ from dynamo.planner.monitoring.dgd_services import ( ...@@ -34,6 +34,13 @@ from dynamo.planner.monitoring.dgd_services import (
get_service_from_sub_component_type_or_name, get_service_from_sub_component_type_or_name,
) )
pytestmark = [
pytest.mark.gpu_0,
pytest.mark.pre_merge,
pytest.mark.unit,
pytest.mark.planner,
]
@pytest.fixture @pytest.fixture
def mock_kube_api(): def mock_kube_api():
...@@ -64,8 +71,9 @@ def kubernetes_connector(mock_kube_api_class, monkeypatch): ...@@ -64,8 +71,9 @@ def kubernetes_connector(mock_kube_api_class, monkeypatch):
def test_kubernetes_connector_no_env_var(): def test_kubernetes_connector_no_env_var():
with pytest.raises(DeploymentValidationError) as exc_info: with patch("dynamo.planner.connectors.kubernetes.KubernetesAPI"):
KubernetesConnector("test-dynamo-namespace") with pytest.raises(DeploymentValidationError) as exc_info:
KubernetesConnector("test-dynamo-namespace")
exception = exc_info.value exception = exc_info.value
assert set(exception.errors) == { assert set(exception.errors) == {
......
...@@ -256,6 +256,7 @@ def _build_load_config(**overrides) -> PlannerConfig: ...@@ -256,6 +256,7 @@ def _build_load_config(**overrides) -> PlannerConfig:
profile_results_dir=os.path.join( profile_results_dir=os.path.join(
os.path.dirname(__file__), os.path.dirname(__file__),
"..", "..",
"data",
"profiling_results", "profiling_results",
"H200_TP1P_TP1D", "H200_TP1P_TP1D",
), ),
......
...@@ -14,7 +14,7 @@ import tempfile ...@@ -14,7 +14,7 @@ import tempfile
from unittest.mock import AsyncMock, MagicMock, patch from unittest.mock import AsyncMock, MagicMock, patch
import pytest import pytest
from utils.load_generator import LoadGenerator from load_generator import LoadGenerator
pytestmark = [ pytestmark = [
pytest.mark.gpu_0, pytest.mark.gpu_0,
......
...@@ -31,7 +31,6 @@ pytestmark = [ ...@@ -31,7 +31,6 @@ pytestmark = [
pytest.mark.pre_merge, pytest.mark.pre_merge,
pytest.mark.unit, pytest.mark.unit,
pytest.mark.planner, pytest.mark.planner,
pytest.mark.vllm,
] ]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment