run_scaling_test.sh 9.37 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Run SLA planner scaling end-to-end test
# This script:
# 1. Deploys the disaggregated planner if not already running
# 2. Sets up port forwarding to localhost:8000
# 3. Waits for the deployment to be ready
# 4. Runs the hardcoded scaling test (graduated 8 -> 15 -> 25 req/s)
# 5. Cleans up the port forward, and the deployment if this script created it

# Stop at the first failing command that is not part of a conditional.
set -e

# ---- Settings ---------------------------------------------------------------
# NAMESPACE can be supplied through the environment; it falls back to "default"
# and may later be overridden by the --namespace option.
: "${NAMESPACE:=default}"
YAML_FILE='disagg_planner.yaml'       # manifest applied by deploy_planner
FRONTEND_PORT='8000'                  # port exposed by the frontend service
LOCAL_PORT='8000'                     # local end of the port forward
DEPLOYMENT_NAME='vllm-disagg-planner' # DynamoGraphDeployment resource name
SAVE_RESULTS='false'                  # switched to true by --save-results

# ---- ANSI escape sequences used by the log_* helpers -------------------------
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # reset ("no colour")

# Print an informational message on stdout, prefixed with a blue [INFO] tag.
# Arguments: $1 - message text (backslash escapes in it are interpreted)
log_info() {
    printf '%b\n' "${BLUE}[INFO]${NC} $1"
}

# Print a success message on stdout, prefixed with a green [SUCCESS] tag.
# Arguments: $1 - message text (backslash escapes in it are interpreted)
log_success() {
    printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}

# Print a warning, prefixed with a yellow [WARNING] tag.
# Warnings are diagnostics, so they go to stderr and do not mix with anything
# a caller captures from stdout.
# Arguments: $1 - message text (backslash escapes in it are interpreted)
log_warning() {
    printf '%b\n' "${YELLOW}[WARNING]${NC} $1" >&2
}

# Print an error message, prefixed with a red [ERROR] tag.
# Errors are written to stderr so they stay visible when stdout is redirected
# or captured.
# Arguments: $1 - message text (backslash escapes in it are interpreted)
log_error() {
    printf '%b\n' "${RED}[ERROR]${NC} $1" >&2
}

# Verify that every tool and file the test depends on is available.
#
# Checks, in order: kubectl on PATH, a Python interpreter (python or python3),
# a reachable cluster (kubectl cluster-info), test_scaling_e2e.py in the
# current directory, and genai-perf on PATH. When genai-perf is missing the
# user is asked whether it should be installed with pip.
#
# Outputs:  progress and error messages via the log_* helpers; the install
#           prompt on stdout
# Returns:  0 when all checks pass; exits the script with status 1 otherwise
check_prerequisites() {
    log_info "Checking prerequisites..."

    if ! command -v kubectl &> /dev/null; then
        log_error "kubectl not found. Please install kubectl."
        exit 1
    fi

    if ! command -v python &> /dev/null && ! command -v python3 &> /dev/null; then
        log_error "Python not found. Please install Python."
        exit 1
    fi

    if ! kubectl cluster-info &> /dev/null; then
        log_error "Cannot connect to Kubernetes cluster."
        exit 1
    fi

    if [ ! -f "test_scaling_e2e.py" ]; then
        log_error "test_scaling_e2e.py not found. Make sure you're in the tests/planner directory."
        exit 1
    fi

    # Check for genai-perf
    if ! command -v genai-perf &> /dev/null; then
        # One package list drives both the install command and the hint shown
        # to the user. These are minimum versions (lower bounds), not pins.
        local -a packages=('nvidia-ml-py3>=12.0.0' 'genai-perf>=0.0.4' 'tritonclient[all]>=2.48.0')
        local install_hint="pip install"
        local pkg
        for pkg in "${packages[@]}"; do
            install_hint+=" '$pkg'"
        done

        log_warning "genai-perf not found. This tool is required for load generation."
        echo -n "Would you like us to install it for you? (y/n): "
        # read returns non-zero at EOF (stdin closed, e.g. in CI). Treat that
        # as "no" so the user gets the explanatory error below instead of a
        # silent abort caused by set -e.
        local response=""
        read -r response || response=""
        if [[ "$response" =~ ^[Yy]$ ]]; then
            log_info "Installing genai-perf and perf_analyzer..."
            if pip install "${packages[@]}"; then
                log_success "genai-perf and perf_analyzer installed successfully"
            else
                log_error "Failed to install genai-perf. Please install it manually: $install_hint"
                exit 1
            fi
        else
            log_error "genai-perf is required for the scaling test. Please install it: $install_hint"
            exit 1
        fi
    fi

    log_success "Prerequisites check passed"
}

# Report whether a usable deployment is already present in the cluster.
#
# A deployment counts as usable when the DynamoGraphDeployment named
# $DEPLOYMENT_NAME exists, its .status.state is "successful", and
# `kubectl get pods` lists at least one frontend pod in phase Running.
#
# Globals:  DEPLOYMENT_NAME, NAMESPACE (read)
# Returns:  0 if the existing deployment can be reused, 1 if the caller
#           should deploy
check_existing_deployment() {
    log_info "Checking for existing deployment..."

    # Check for the DynamoGraphDeployment custom resource
    if ! kubectl get dynamographdeployment "$DEPLOYMENT_NAME" -n "$NAMESPACE" &> /dev/null; then
        log_info "No existing deployment found"
        return 1
    fi
    # Readiness is not known yet, so do not promise to skip redeployment here.
    log_info "DynamoGraphDeployment $DEPLOYMENT_NAME already exists - checking whether it is ready"

    # A failed lookup is treated like an empty (not ready) state rather than
    # aborting when this function runs with set -e in effect.
    local status
    status=$(kubectl get dynamographdeployment "$DEPLOYMENT_NAME" -n "$NAMESPACE" -o jsonpath='{.status.state}') || status=""
    if [ "$status" != "successful" ]; then
        log_warning "Existing deployment is not ready (status: $status), will redeploy"
        return 1
    fi

    # kubectl prints nothing on stdout when no pod matches, so grep -q .
    # succeeds only if at least one Running frontend pod was listed.
    local frontend_selector="nvidia.com/dynamo-component-type=frontend,nvidia.com/dynamo-namespace=vllm-disagg-planner"
    if ! kubectl get pods -n "$NAMESPACE" -l "$frontend_selector" --field-selector=status.phase=Running | grep -q .; then
        log_warning "Existing deployment pods are not ready, will redeploy"
        return 1
    fi

    log_success "Existing deployment is ready"
    return 0
}

# Apply the planner manifest and block until the deployment and its frontend
# pod report Ready.
#
# Sequence: kubectl apply of $YAML_FILE, wait up to 600s for the
# DynamoGraphDeployment Ready condition, wait up to 900s for the frontend pod
# Ready condition, then pause a fixed 30s.
#
# Globals:  YAML_FILE, NAMESPACE, DEPLOYMENT_NAME (read)
# Returns:  0 on success; exits the script with status 1 when the manifest is
#           missing, the apply fails, or either wait fails
deploy_planner() {
    local frontend_selector="nvidia.com/dynamo-component-type=frontend,nvidia.com/dynamo-namespace=vllm-disagg-planner"
    local graph_resource="dynamographdeployment/$DEPLOYMENT_NAME"

    log_info "Deploying SLA planner..."

    [[ -f "$YAML_FILE" ]] || {
        log_error "Deployment file $YAML_FILE not found"
        exit 1
    }

    # Step 1: submit the manifest.
    if ! kubectl apply -f "$YAML_FILE" -n "$NAMESPACE"; then
        log_error "Failed to apply deployment"
        exit 1
    fi
    log_success "Deployment applied successfully"

    # Step 2: wait for the custom resource itself.
    log_info "Waiting for DynamoGraphDeployment to be processed..."
    if ! kubectl wait --for=condition=Ready "$graph_resource" -n "$NAMESPACE" --timeout=600s; then
        log_error "DynamoGraphDeployment failed to become ready within timeout"
        exit 1
    fi
    log_success "DynamoGraphDeployment is ready"

    log_info "Waiting for pods to be running (this may take several minutes for image pulls)..."

    # Step 3: wait for the frontend pod selected by label.
    log_info "Waiting for frontend pod..."
    if ! kubectl wait --for=condition=Ready pod -l "$frontend_selector" -n "$NAMESPACE" --timeout=900s; then
        log_error "Frontend pod failed to become ready within timeout"
        exit 1
    fi
    log_success "Frontend pod is ready"

    # Step 4: fixed grace period; no further readiness check is performed.
    log_info "Waiting for all pods to be running..."
    sleep 30
}

# Forward localhost:$LOCAL_PORT to the frontend service and wait until the
# /health endpoint answers.
#
# Processes that lsof reports on $LOCAL_PORT are signalled with kill first.
# kubectl port-forward is started in the background and its PID is stored in
# PORT_FORWARD_PID so that cleanup_port_forward can stop it later.
#
# Globals:  LOCAL_PORT, FRONTEND_PORT, NAMESPACE (read)
#           PORT_FORWARD_PID (written)
# Returns:  0 once curl succeeds against /health; 1 if the frontend service
#           does not exist or curl kept failing for 30 attempts, 2s apart
setup_port_forward() {
    log_info "Setting up port forwarding..."

    # Kill any existing port forward on the same port.
    # lsof -t prints one PID per line and several processes can hold the port.
    local stale_pids
    stale_pids=$(lsof -ti:"$LOCAL_PORT" 2>/dev/null || true)
    if [ -n "$stale_pids" ]; then
        log_warning "Port $LOCAL_PORT is already in use, attempting to free it..."
        # Deliberately unquoted: the newline-separated list must be split into
        # one argument per PID, otherwise kill receives a single bogus argument.
        # shellcheck disable=SC2086
        kill $stale_pids 2>/dev/null || true
        sleep 2
    fi

    local frontend_service="vllm-disagg-planner-frontend"

    if ! kubectl get service "$frontend_service" -n "$NAMESPACE" &> /dev/null; then
        log_error "Frontend service '$frontend_service' not found"
        return 1
    fi

    log_info "Port forwarding to service: $frontend_service"
    kubectl port-forward service/"$frontend_service" "$LOCAL_PORT:$FRONTEND_PORT" -n "$NAMESPACE" >/dev/null 2>&1 &
    PORT_FORWARD_PID=$!

    log_info "Waiting for port forwarding to be established..."
    local attempt
    for attempt in {1..30}; do
        if curl -s "http://localhost:$LOCAL_PORT/health" &> /dev/null; then
            log_success "Port forwarding established and service is healthy"
            return 0
        fi
        sleep 2
    done

    log_error "Failed to establish port forwarding or service is not healthy"
    return 1
}

# Stop the background kubectl port-forward, if one was started.
#
# Does nothing when PORT_FORWARD_PID is empty or unset. Failures of kill and
# wait are ignored, so the function always returns 0.
#
# Globals:  PORT_FORWARD_PID (read)
cleanup_port_forward() {
    [[ -n "${PORT_FORWARD_PID:-}" ]] || return 0

    log_info "Cleaning up port forwarding..."
    kill "$PORT_FORWARD_PID" 2>/dev/null || true
    # Reap the child so it does not linger as a zombie.
    wait "$PORT_FORWARD_PID" 2>/dev/null || true
}

# Delete the resources created from $YAML_FILE and wait for the
# DynamoGraphDeployment to disappear.
#
# Cleanup is best-effort: a failing kubectl delete is reported as a warning
# instead of aborting the script under set -e, so the caller can still report
# the test result and exit with the test's status.
#
# Globals:  YAML_FILE, NAMESPACE, DEPLOYMENT_NAME (read)
# Returns:  always 0
cleanup_deployment() {
    log_info "Cleaning up deployment..."
    if ! kubectl delete -f "$YAML_FILE" -n "$NAMESPACE" --ignore-not-found; then
        log_warning "Failed to delete resources from $YAML_FILE; manual cleanup may be required"
        return 0
    fi

    log_info "Waiting for cleanup to complete..."
    # Wait up to 120s for the deletion; a failed or timed-out wait is ignored.
    kubectl wait --for=delete dynamographdeployment/"$DEPLOYMENT_NAME" -n "$NAMESPACE" --timeout=120s || true

    log_info "Cleanup complete"
}

# Run test_scaling_e2e.py against the configured namespace.
#
# python3 is preferred and python is used when python3 is not available.
# --save-results is passed on to the Python script when SAVE_RESULTS is true.
#
# Globals:  NAMESPACE, SAVE_RESULTS (read)
# Returns:  0 if the Python script exits successfully, 1 otherwise
run_test() {
    log_info "Running scaling test (graduated 8->15->25 req/s)..."

    local python_cmd="python3"
    if ! command -v python3 &> /dev/null; then
        python_cmd="python"
    fi

    # Collect arguments in an array so each value reaches the script as exactly
    # one argument, with no word splitting or glob expansion.
    local -a test_args=(--namespace "$NAMESPACE")
    if [ "$SAVE_RESULTS" = true ]; then
        test_args+=(--save-results)
        log_info "Results will be saved to tests/planner/e2e_scaling_results"
    fi

    if "$python_cmd" test_scaling_e2e.py "${test_args[@]}"; then
        log_success "Scaling test PASSED"
        return 0
    else
        log_error "Scaling test FAILED"
        return 1
    fi
}

# Parse command-line options and run the whole scenario: prerequisite checks,
# optional deployment, port forwarding, the scaling test, and cleanup.
#
# Options:
#   --namespace NS   Kubernetes namespace to use (overrides $NAMESPACE)
#   --save-results   pass --save-results on to the Python test
#   --help           print usage and exit 0
#
# Globals:  NAMESPACE, SAVE_RESULTS (written from the options)
# Exits:    0 if the scaling test passed, 1 on a usage error, a setup failure
#           or a failed test
main() {
    while [[ $# -gt 0 ]]; do
        case $1 in
            --namespace)
                # Without this check "shift 2" fails when the value is
                # missing: a silent exit under set -e, an endless loop without.
                if [[ $# -lt 2 ]]; then
                    log_error "--namespace requires a value"
                    echo "Use --help for usage information"
                    exit 1
                fi
                NAMESPACE="$2"
                shift 2
                ;;
            --save-results)
                SAVE_RESULTS=true
                shift
                ;;
            --help)
                echo "Usage: $0 [--namespace NS] [--save-results]"
                echo ""
                echo "Run SLA planner scaling test (graduated 8->15->25 req/s prefill scaling)"
                echo ""
                echo "Options:"
                echo "  --namespace NS    Kubernetes namespace (default: default)"
                echo "  --save-results    Save results to tests/planner/e2e_scaling_results instead of /tmp"
                echo "  --help            Show this help"
                exit 0
                ;;
            *)
                log_error "Unknown option: $1"
                echo "Use --help for usage information"
                exit 1
                ;;
        esac
    done

    log_info "SLA Planner Scaling Test"
    log_info "Namespace: $NAMESPACE"
    log_info "Scenario: Graduated 8->15->25 req/s (1P1D -> 2P1D prefill scaling, ISL=4000/OSL=150)"

    check_prerequisites

    # From here on, stop the background port forward on every exit path.
    trap cleanup_port_forward EXIT

    # Check if we need to deploy
    # NOTE(review): deployed_by_us also becomes true when a deployment existed
    # but was not ready, so that pre-existing deployment is deleted at the end
    # - confirm this is intended.
    local deployed_by_us=false
    if ! check_existing_deployment; then
        deploy_planner
        deployed_by_us=true
    fi

    if ! setup_port_forward; then
        log_error "Failed to setup port forwarding"
        exit 1
    fi

    # Remember the outcome so that cleanup still runs after a failed test.
    local test_result=0
    if ! run_test; then
        test_result=1
    fi

    # Only cleanup deployment if we deployed it
    if [ "$deployed_by_us" = true ]; then
        cleanup_deployment
    fi

    if [ "$test_result" -eq 0 ]; then
        log_success "Test completed successfully!"
    else
        log_error "Test failed!"
    fi

    exit "$test_result"
}

# Script entry point: hand every command-line argument to main unchanged.
main "$@"