# CI workflow for prefill/decode (PD) disaggregation.
#
# Job `test-disaggregation` validates the runner (GPUs, InfiniBand, model
# files), installs SGLang and sgl-router, starts the disaggregation servers,
# and then, for each routing policy, launches a router, smoke-tests the chat
# completions API (plain and streaming) and runs a thresholded benchmark.
# Job `summarize-benchmarks` renders the saved benchmark files as a table in
# the run's step summary.
name: Test Disaggregation Mode

on:
  push:
    branches: [ main ]
    paths:
      - 'python/sglang/srt/disaggregation/**'
      - 'scripts/ci_start_disaggregation_servers.sh'
      - 'sgl-router/**'
  pull_request:
    branches: [ main ]
    paths:
      - 'python/sglang/srt/disaggregation/**'
      - 'scripts/ci_start_disaggregation_servers.sh'
      - 'sgl-router/**'
  workflow_dispatch:

# Only one run per ref; a newer run cancels the one in progress.
concurrency:
  group: test-disaggregation-${{ github.ref }}
  cancel-in-progress: true

permissions:
  contents: read
  pull-requests: write
  issues: write

jobs:
  test-disaggregation:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: [h200]
    timeout-minutes: 45

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 10

      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'

      - name: Setup Rust
        run: |
          bash scripts/ci_install_rust.sh

      - name: Cache Rust dependencies
        uses: actions/cache@v4
        with:
          path: |
            ~/.cargo/bin/
            ~/.cargo/registry/index/
            ~/.cargo/registry/cache/
            ~/.cargo/git/db/
            sgl-router/target/
          key: ${{ runner.os }}-cargo-${{ hashFiles('sgl-router/Cargo.lock') }}
          restore-keys: |
            ${{ runner.os }}-cargo-

      - name: Cache pip dependencies
        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('python/pyproject.toml') }}
          restore-keys: |
            ${{ runner.os }}-pip-

      # Fail early if the runner lacks the hardware or model files the test needs.
      - name: Validate environment
        run: |
          echo "=== System Validation ==="
          nvidia-smi
          echo "GPU count: $(nvidia-smi -L | wc -l)"
          if [ $(nvidia-smi -L | wc -l) -lt 8 ]; then
            echo "Error: This test requires at least 8 GPUs"
            exit 1
          fi

          echo "=== RDMA Validation ==="
          if ! command -v ibv_devices >/dev/null 2>&1; then
            echo "Error: InfiniBand tools not found"
            exit 1
          fi

          # Check for active IB devices
          found_active_device=false
          for device in mlx5_{0..11}; do
            if ibv_devinfo $device >/dev/null 2>&1; then
              state=$(ibv_devinfo $device | grep "state:" | head -1 | awk '{print $2}')
              if [[ "$state" == "PORT_ACTIVE" ]]; then
                echo "✓ Found active device: $device"
                found_active_device=true
                break
              fi
            fi
          done

          if [ "$found_active_device" = false ]; then
            echo "Error: No active IB devices found"
            echo "Available devices:"
            ibv_devices || true
            exit 1
          fi

          echo "=== Model Validation ==="
          if [ ! -d "/raid/models/meta-llama/Llama-3.1-8B-Instruct" ]; then
            echo "Error: Model not found"
            ls -la /raid/models/ || echo "No models directory"
            exit 1
          fi
          echo "✓ Model found"

      - name: Install SGLang dependencies
        run: |
          echo "Installing SGLang with all extras..."
          python3 -m pip --no-cache-dir install -e "python[all]" --break-system-packages
          python3 -m pip --no-cache-dir install mooncake-transfer-engine==0.3.4.post2

      - name: Build and install sgl-router
        run: |
          source "$HOME/.cargo/env"
          echo "Building sgl-router..."
          cd sgl-router
          cargo build && python3 -m build && pip install --force-reinstall dist/*.whl

      # Launches the server startup script in the background and exposes its
      # PID as a step output so the cleanup step can terminate it later.
      - name: Start disaggregation servers
        id: start_servers
        run: |
          echo "Starting disaggregation servers..."
          bash scripts/ci_start_disaggregation_servers.sh &
          SERVER_PID=$!
          echo "server_pid=$SERVER_PID" >> $GITHUB_OUTPUT

          # Wait for all 8 servers to be healthy (script already does this).
          # Polls for up to 30 iterations of 2 seconds each.
          wait_count=0
          while [ $wait_count -lt 30 ]; do
            if ps -p $SERVER_PID > /dev/null; then
              # Startup script is still running; keep polling
              sleep 2
              wait_count=$((wait_count + 1))
            else
              # Script exited - check if it was successful.
              # The `||` keeps the default `bash -e` shell from aborting on a
              # non-zero wait status before the error message can be printed.
              exit_code=0
              wait $SERVER_PID || exit_code=$?
              if [ $exit_code -eq 0 ]; then
                echo "✓ All disaggregation servers are healthy"
                break
              else
                echo "Error: Server startup failed with code $exit_code"
                exit 1
              fi
            fi
          done

          echo "✓ Servers started (PID: $SERVER_PID)"

      # For every policy: start a router, wait for it, run a chat completion,
      # a streaming completion and a benchmark, then stop the router.
      - name: Test all policies sequentially
        timeout-minutes: 30
        run: |
          POLICIES=("random" "round_robin" "cache_aware" "power_of_two")
          BASE_URL="http://127.0.0.9:8000"

          for policy in "${POLICIES[@]}"; do
            echo ""
            echo "=================================================="
            echo "Testing policy: $policy"
            echo "=================================================="

            # Start router with the current policy
            echo "Starting router with policy: $policy..."
            python3 -m sglang_router.launch_router \
              --pd-disaggregation \
              --policy "$policy" \
              --prefill http://127.0.0.1:30001 9001 \
              --prefill http://127.0.0.2:30002 9002 \
              --prefill http://127.0.0.3:30003 9003 \
              --prefill http://127.0.0.4:30004 9004 \
              --decode http://127.0.0.5:30005 \
              --decode http://127.0.0.6:30006 \
              --decode http://127.0.0.7:30007 \
              --decode http://127.0.0.8:30008 \
              --host 127.0.0.9 \
              --port 8000 &
            ROUTER_PID=$!

            # Wait for router to become healthy
            echo "Waiting for router to become healthy..."
            TIMEOUT=60
            ELAPSED=0
            while [ $ELAPSED -lt $TIMEOUT ]; do
              if curl --connect-timeout 5 --silent http://127.0.0.9:8000 > /dev/null 2>&1; then
                echo "✓ Router is reachable"
                break
              fi
              if ! ps -p $ROUTER_PID > /dev/null; then
                echo "Error: Router process died"
                exit 1
              fi
              sleep 5
              ELAPSED=$((ELAPSED + 5))
            done

            if [ $ELAPSED -ge $TIMEOUT ]; then
              echo "Error: Router health check timeout"
              kill $ROUTER_PID 2>/dev/null || true
              exit 1
            fi

            # Test API functionality
            echo "Testing API completions for $policy..."
            response=$(curl -s -X POST "$BASE_URL/v1/chat/completions" \
              -H "Content-Type: application/json" \
              -H "Authorization: Bearer test-token" \
              -d '{
                "model": "/raid/models/meta-llama/Llama-3.1-8B-Instruct",
                "messages": [
                  {"role": "user", "content": "Write a Python function to calculate fibonacci numbers recursively"}
                ],
                "stream": false,
                "max_tokens": 100
              }')

            if echo "$response" | jq -e '.choices[0].message.content' > /dev/null 2>&1; then
              echo "✓ API test passed for $policy"
            else
              echo "✗ API test failed for $policy: $response"
              kill $ROUTER_PID 2>/dev/null || true
              exit 1
            fi

            # Test streaming
            echo "Testing streaming API for $policy..."
            stream_response=$(timeout 30 curl -s -X POST "$BASE_URL/v1/chat/completions" \
              -H "Content-Type: application/json" \
              -H "Authorization: Bearer test-token" \
              -d '{
                "model": "/raid/models/meta-llama/Llama-3.1-8B-Instruct",
                "messages": [
                  {"role": "user", "content": "Count from 1 to 5"}
                ],
                "stream": true,
                "max_tokens": 50
              }')

            if echo "$stream_response" | grep -q "data:"; then
              echo "✓ Streaming API test passed for $policy"
            else
              echo "✗ Streaming API test failed for $policy"
              kill $ROUTER_PID 2>/dev/null || true
              exit 1
            fi

            # Run benchmark
            echo "Running benchmark for $policy..."
            benchmark_output=$(python3 -m sglang.bench_one_batch_server \
              --model-path "/raid/models/meta-llama/Llama-3.1-8B-Instruct" \
              --base-url "http://127.0.0.9:8000" \
              --batch-size 8 \
              --input-len 4096 \
              --output-len 5 \
              --skip-warmup)

            echo "$benchmark_output"

            # Save benchmark output
            echo "$benchmark_output" > "benchmark_${policy}.txt"

            # Extract and validate metrics
            latency=$(echo "$benchmark_output" | grep "latency:" | awk '{print $2}' | sed 's/s//')
            input_throughput=$(echo "$benchmark_output" | grep "input throughput:" | awk '{print $3}')
            output_throughput=$(echo "$benchmark_output" | grep "output throughput:" | awk '{print $3}')

            command -v bc >/dev/null || (apt-get update && apt-get install -y bc)

            echo "Performance for $policy: ${latency}s | ${input_throughput} | ${output_throughput} tok/s"

            # Validate performance. An empty metric means the benchmark output
            # could not be parsed; treat that as a failure, because an empty
            # operand makes every bc comparison below evaluate as "not exceeded".
            fail=""
            if [ -z "$latency" ] || [ -z "$input_throughput" ] || [ -z "$output_throughput" ]; then
              fail="Could not parse benchmark metrics "
            else
              if (( $(echo "$latency > 1.5" | bc -l) )); then
                fail="Latency too high (${latency}s>1.5s) "
              fi
              if (( $(echo "$input_throughput < 20000" | bc -l) )); then
                fail="${fail}Input too low (${input_throughput}<20k) "
              fi
              if (( $(echo "$output_throughput < 1000" | bc -l) )); then
                fail="${fail}Output too low (${output_throughput}<1k) "
              fi
            fi

            if [ -n "$fail" ]; then
              echo "✗ Benchmark failed for $policy: $fail"
              kill $ROUTER_PID 2>/dev/null || true
              exit 1
            else
              echo "✓ Performance validation passed for $policy"
            fi

            # Stop router before testing next policy
            echo "Stopping router for $policy..."
            # First try graceful shutdown
            kill $ROUTER_PID 2>/dev/null || true

            # Wait up to 5 seconds for graceful shutdown
            for i in {1..5}; do
              if ! ps -p $ROUTER_PID > /dev/null 2>&1; then
                echo "Router stopped gracefully"
                break
              fi
              sleep 1
            done

            # Force kill if still running
            if ps -p $ROUTER_PID > /dev/null 2>&1; then
              echo "Force killing router..."
              kill -9 $ROUTER_PID 2>/dev/null || true
            fi

            # Short delay to ensure port is released
            sleep 2

            echo "✓ Completed testing for $policy"
          done

          echo ""
          echo "✅ All policies tested successfully!"

      - name: Upload benchmark results
        if: success()
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results-all-policies
          path: benchmark_*.txt

      # Runs even when earlier steps failed so servers do not outlive the job.
      - name: Cleanup servers
        if: always()
        run: |
          if [ -n "${{ steps.start_servers.outputs.server_pid }}" ]; then
            pkill -P ${{ steps.start_servers.outputs.server_pid }} || true
            kill ${{ steps.start_servers.outputs.server_pid }} || true
          fi
          pkill -f "sglang.launch_server" || true
          sleep 5
          # pgrep -c prints the match count (excluding pgrep itself) and exits
          # non-zero when that count is 0, hence the `|| true`.
          remaining=$(pgrep -f -c "sglang.launch_server" || true)
          echo "Cleanup completed. Remaining processes: $remaining"

  summarize-benchmarks:
    needs: test-disaggregation
    runs-on: ubuntu-latest
    if: success()

    steps:
      - name: Download benchmark results
        uses: actions/download-artifact@v4
        with:
          name: benchmark-results-all-policies

      # Builds a Markdown table with one row per policy whose benchmark file exists.
      - name: Create benchmark summary
        run: |
          echo "## PD Router Benchmark Results Summary" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "| Policy | Latency (s) | Input Throughput (tok/s) | Output Throughput (tok/s) |" >> $GITHUB_STEP_SUMMARY
          echo "|--------|-------------|-------------------------|--------------------------|" >> $GITHUB_STEP_SUMMARY

          for policy in random round_robin cache_aware power_of_two; do
            if [ -f "benchmark_${policy}.txt" ]; then
              latency=$(grep "latency:" "benchmark_${policy}.txt" | awk '{print $2}')
              input_throughput=$(grep "input throughput:" "benchmark_${policy}.txt" | awk '{print $3}')
              output_throughput=$(grep "output throughput:" "benchmark_${policy}.txt" | awk '{print $3}')

              echo "| ${policy} | ${latency} | ${input_throughput} | ${output_throughput} |" >> $GITHUB_STEP_SUMMARY
            fi
          done

          echo "" >> $GITHUB_STEP_SUMMARY
          echo "✅ All policies tested successfully!" >> $GITHUB_STEP_SUMMARY