name: PR Test (PD Router)

on:
  push:
    branches: [ main ]
    paths:
      - 'python/sglang/srt/disaggregation/**'
      - 'scripts/ci/ci_start_disaggregation_servers.sh'
      - 'sgl-router/**'
  pull_request:
    branches: [ main ]
    paths:
      - 'python/sglang/srt/disaggregation/**'
      - 'scripts/ci/ci_start_disaggregation_servers.sh'
      - 'sgl-router/**'
    types: [synchronize, labeled]
  workflow_dispatch:

concurrency:
  group: test-disaggregation-${{ github.ref }}
  cancel-in-progress: true

permissions:
  contents: read
  pull-requests: write
  issues: write

jobs:
  test-disaggregation:
    if: github.event_name != 'pull_request' || (contains(github.event.pull_request.labels.*.name, 'run-ci') && contains(github.event.pull_request.labels.*.name, 'router-benchmark'))
    runs-on: [8-gpu-h200-oracle]
    timeout-minutes: 45
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 10

      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.12'

      - name: Setup Rust
        run: |
          bash scripts/ci/ci_install_rust.sh

      - name: Cache Rust dependencies
        uses: actions/cache@v4
        with:
          path: |
            ~/.cargo/bin/
            ~/.cargo/registry/index/
            ~/.cargo/registry/cache/
            ~/.cargo/git/db/
            sgl-router/target/
          key: ${{ runner.os }}-cargo-${{ hashFiles('sgl-router/Cargo.lock') }}
          restore-keys: |
            ${{ runner.os }}-cargo-

      - name: Cache pip dependencies
        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('python/pyproject.toml') }}
          restore-keys: |
            ${{ runner.os }}-pip-

      - name: Validate environment
        run: |
          echo "=== System Validation ==="
          nvidia-smi
          echo "GPU count: $(nvidia-smi -L | wc -l)"
          if [ "$(nvidia-smi -L | wc -l)" -lt 8 ]; then
            echo "Error: This test requires at least 8 GPUs"
            exit 1
          fi

          echo "=== GPU Process Check ==="
          # Fail fast if any GPU compute processes are active
          if command -v nvidia-smi >/dev/null 2>&1; then
            # Try to query compute apps first (preferred and concise)
            gpu_procs=$(nvidia-smi --query-compute-apps=pid,process_name,gpu_uuid --format=csv,noheader 2>/dev/null | sed '/^$/d' || true)
            # Fall back to the detailed PIDS report in case the query returned nothing but processes still exist
            if [ -z "$gpu_procs" ]; then
              gpu_procs=$(nvidia-smi -q -d PIDS 2>/dev/null | awk '/Processes/{flag=1;next}/^$/{flag=0}flag' | sed '/^\s*Processes:/d' | sed '/^\s*$/d' || true)
            fi
            if [ -n "$gpu_procs" ]; then
              echo "Error: Found active GPU processes using the device(s):"
              echo "$gpu_procs"
              exit 1
            else
              echo "No active GPU compute processes detected."
            fi
          else
            echo "Error: nvidia-smi not found; cannot run the GPU process check."
            exit 1
          fi

          echo "=== RDMA Validation ==="
          if ! command -v ibv_devices >/dev/null 2>&1; then
            echo "Error: InfiniBand tools not found"
            exit 1
          fi
          # Check for active IB devices
          found_active_device=false
          for device in mlx5_{0..11}; do
            if ibv_devinfo "$device" >/dev/null 2>&1; then
              state=$(ibv_devinfo "$device" | grep "state:" | head -1 | awk '{print $2}')
              if [[ "$state" == "PORT_ACTIVE" ]]; then
                echo "✓ Found active device: $device"
                found_active_device=true
                break
              fi
            fi
          done
          if [ "$found_active_device" = false ]; then
            echo "Error: No active IB devices found"
            echo "Available devices:"
            ibv_devices || true
            exit 1
          fi

          echo "=== Model Validation ==="
          if [ ! -d "/raid/models/meta-llama/Llama-3.1-8B-Instruct" ]; then
            echo "Error: Model not found"
            ls -la /raid/models/ || echo "No models directory"
            exit 1
          fi
          echo "✓ Model found"

      - name: Install SGLang dependencies
        run: |
          echo "Installing SGLang with all extras..."
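          # Note (assumption): torch is pinned and pulled from the cu128 test index
          # so it matches the CUDA runtime on the H200 runners, and
          # mooncake-transfer-engine provides the KV-transfer backend that PD
          # disaggregation relies on. These pins were validated together; bump
          # them together rather than individually.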
          python3 -m pip --no-cache-dir install --upgrade pip
          python3 -m pip --no-cache-dir install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu128
          python3 -m pip --no-cache-dir install -e "python[all]" --break-system-packages
          python3 -m pip --no-cache-dir install mooncake-transfer-engine==0.3.6.post1
          python3 -m pip --no-cache-dir install --user --force-reinstall genai-bench==0.0.2

      - name: Build and install sgl-router
        run: |
          source "$HOME/.cargo/env"
          echo "Building sgl-router..."
          cd sgl-router
          cargo build && python3 -m build && pip install --force-reinstall dist/*.whl

      - name: Start disaggregation servers
        id: start_servers
        run: |
          echo "Starting disaggregation servers..."
          READY_FILE=".disagg_ready"
          rm -f "$READY_FILE"
          DISAGG_READY_FILE="$READY_FILE" bash scripts/ci/ci_start_disaggregation_servers.sh &
          SERVER_PID=$!
          echo "server_pid=$SERVER_PID" >> $GITHUB_OUTPUT

          # Wait until the script signals readiness (8/8 healthy) or time out
          TIMEOUT=300
          ELAPSED=0
          while [ $ELAPSED -lt $TIMEOUT ]; do
            if [ -f "$READY_FILE" ]; then
              echo "✓ All disaggregation servers are healthy (signal detected)"
              break
            fi
            if ! ps -p $SERVER_PID > /dev/null; then
              echo "Error: server bootstrap script exited prematurely"
              exit 1
            fi
            sleep 5
            ELAPSED=$((ELAPSED + 5))
          done
          if [ $ELAPSED -ge $TIMEOUT ]; then
            echo "❌ Timeout waiting for disaggregation servers to become healthy"
            exit 1
          fi
          echo "✓ Servers started (PID: $SERVER_PID)"

      - name: Test all policies sequentially
        timeout-minutes: 30
        run: |
          POLICIES=("random" "round_robin" "cache_aware" "power_of_two")
          BASE_URL="http://127.0.0.9:8000"

          # Free commonly used ports for router and metrics
          echo "Freeing ports 29000 (metrics) and 8000 (API), if in use..."
          fuser -k -n tcp 29000 2>/dev/null || true
          fuser -k -n tcp 8000 2>/dev/null || true
          sleep 1

          for policy in "${POLICIES[@]}"; do
            echo ""
            echo "=================================================="
            echo "Testing policy: $policy"
            echo "=================================================="

            # Free ports before starting the router
            fuser -k -n tcp 29000 2>/dev/null || true
            fuser -k -n tcp 8000 2>/dev/null || true

            # Start the router with the current policy
            echo "Starting router with policy: $policy..."
            RUST_BACKTRACE=1 python3 -m sglang_router.launch_router \
              --pd-disaggregation \
              --policy "$policy" \
              --prefill http://127.0.0.1:30001 9001 \
              --prefill http://127.0.0.2:30002 9002 \
              --prefill http://127.0.0.3:30003 9003 \
              --prefill http://127.0.0.4:30004 9004 \
              --decode http://127.0.0.5:30005 \
              --decode http://127.0.0.6:30006 \
              --decode http://127.0.0.7:30007 \
              --decode http://127.0.0.8:30008 \
              --host 127.0.0.9 \
              --log-level warn \
              --port 8000 &
            ROUTER_PID=$!

            # Wait for the router to become reachable
            echo "Waiting for router to become healthy..."
            TIMEOUT=60
            ELAPSED=0
            while [ $ELAPSED -lt $TIMEOUT ]; do
              if curl --connect-timeout 5 --silent "$BASE_URL" > /dev/null 2>&1; then
                echo "✓ Router is reachable"
                break
              fi
              if ! ps -p $ROUTER_PID > /dev/null; then
                echo "Error: Router process died"
                exit 1
              fi
              sleep 5
              ELAPSED=$((ELAPSED + 5))
            done
            if [ $ELAPSED -ge $TIMEOUT ]; then
              echo "Error: Router health check timeout"
              kill $ROUTER_PID 2>/dev/null || true
              exit 1
            fi

            # Test API functionality
            echo "Testing API completions for $policy..."
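            # Non-streaming smoke test: a single chat completion routed through
            # the PD router. The jq -e check below treats a missing
            # .choices[0].message.content as failure, so malformed responses are
            # caught as well as transport errors.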
            response=$(curl -s -X POST "$BASE_URL/v1/chat/completions" \
              -H "Content-Type: application/json" \
              -H "Authorization: Bearer test-token" \
              -d '{
                "model": "/raid/models/meta-llama/Llama-3.1-8B-Instruct",
                "messages": [
                  {"role": "user", "content": "Write a Python function to calculate fibonacci numbers recursively"}
                ],
                "stream": false,
                "max_completion_tokens": 100
              }')
            if echo "$response" | jq -e '.choices[0].message.content' > /dev/null 2>&1; then
              echo "✓ API test passed for $policy"
            else
              echo "✗ API test failed for $policy: $response"
              kill $ROUTER_PID 2>/dev/null || true
              exit 1
            fi

            # Test streaming
            echo "Testing streaming API for $policy..."
            stream_response=$(timeout 30 curl -s -X POST "$BASE_URL/v1/chat/completions" \
              -H "Content-Type: application/json" \
              -H "Authorization: Bearer test-token" \
              -d '{
                "model": "/raid/models/meta-llama/Llama-3.1-8B-Instruct",
                "messages": [
                  {"role": "user", "content": "Count from 1 to 5"}
                ],
                "stream": true,
                "max_completion_tokens": 50
              }')
            if echo "$stream_response" | grep -q "data:"; then
              echo "✓ Streaming API test passed for $policy"
            else
              echo "✗ Streaming API test failed for $policy"
              kill $ROUTER_PID 2>/dev/null || true
              exit 1
            fi

            # Run the genai-bench benchmark
            echo "Running genai-bench for $policy..."
            genai-bench benchmark \
              --api-backend openai \
              --api-base "$BASE_URL" \
              --api-key "dummy-token" \
              --api-model-name "/raid/models/meta-llama/Llama-3.1-8B-Instruct" \
              --model-tokenizer /raid/models/meta-llama/Llama-3.1-8B-Instruct \
              --task text-to-text \
              --num-concurrency 64 \
              --traffic-scenario "D(8000,2000)" \
              --max-requests-per-run 1000 \
              --max-time-per-run 5 \
              --experiment-folder-name "benchmark_${policy}" \
              --experiment-base-dir "."

            # Find the actual experiment folder
            actual_folder=$(find . -maxdepth 1 -name "benchmark_${policy}" -type d | head -1)
            if [ -n "$actual_folder" ]; then
              echo "Genai-bench results saved in: $actual_folder"

              # Extract mean values and validate performance thresholds
              echo "📊 Extracting performance metrics for $policy..."
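              # The jq selectors used below (.aggregated_metrics.stats.<metric>.mean)
              # follow the result layout observed from genai-bench 0.0.2; if a newer
              # release changes that schema, update them here and in the
              # summarize-benchmarks job together.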
              # Find JSON result files, excluding experiment metadata
              json_files=$(find "$actual_folder" -name "*.json" | grep -v experiment_metadata)
              if [ -n "$json_files" ]; then
                # Extract metrics with jq and validate against loose thresholds
                for json_file in $json_files; do
                  echo "Processing: $(basename "$json_file")"

                  # Extract mean values for performance validation
                  ttft_mean=$(jq -r '.aggregated_metrics.stats.ttft.mean' "$json_file")
                  e2e_latency_mean=$(jq -r '.aggregated_metrics.stats.e2e_latency.mean' "$json_file")
                  input_throughput_mean=$(jq -r '.aggregated_metrics.stats.input_throughput.mean' "$json_file")
                  output_throughput_mean=$(jq -r '.aggregated_metrics.stats.output_throughput.mean' "$json_file")

                  echo "  TTFT mean: ${ttft_mean}s"
                  echo "  E2E Latency mean: ${e2e_latency_mean}s"
                  echo "  Input Throughput mean: ${input_throughput_mean} tokens/s"
                  echo "  Output Throughput mean: ${output_throughput_mean} tokens/s"

                  # Mean thresholds (allowing for reasonable variance);
                  # adjust these to match your performance requirements
                  ttft_threshold=4.7                 # Max 4.7 seconds for mean TTFT
                  e2e_latency_threshold=35.0         # Max 35.0 seconds for mean E2E latency
                  input_throughput_threshold=10000   # Min 10,000 tokens/s for mean input throughput
                  output_throughput_threshold=68     # Min 68 tokens/s for mean output throughput

                  # Validate mean thresholds
                  validation_passed=true
                  if (( $(echo "$ttft_mean > $ttft_threshold" | bc -l) )); then
                    echo "❌ TTFT validation failed: $ttft_mean > $ttft_threshold"
                    validation_passed=false
                  fi
                  if (( $(echo "$e2e_latency_mean > $e2e_latency_threshold" | bc -l) )); then
                    echo "❌ E2E Latency validation failed: $e2e_latency_mean > $e2e_latency_threshold"
                    validation_passed=false
                  fi
                  if (( $(echo "$input_throughput_mean < $input_throughput_threshold" | bc -l) )); then
                    echo "❌ Input Throughput validation failed: $input_throughput_mean < $input_throughput_threshold"
                    validation_passed=false
                  fi
                  if (( $(echo "$output_throughput_mean < $output_throughput_threshold" | bc -l) )); then
                    echo "❌ Output Throughput validation failed: $output_throughput_mean < $output_throughput_threshold"
                    validation_passed=false
                  fi

                  if [ "$validation_passed" = true ]; then
                    echo "✅ Performance validation passed for $policy"
                  else
                    echo "❌ Performance validation failed for $policy"
                    kill $ROUTER_PID 2>/dev/null || true
                    exit 1
                  fi
                done
                echo "✓ Genai-bench completed successfully for $policy"
                echo "📊 Detailed metrics and plots available in: $actual_folder"
              else
                echo "✗ Benchmark failed for $policy: No JSON results found"
                kill $ROUTER_PID 2>/dev/null || true
                exit 1
              fi
            else
              echo "✗ Benchmark failed for $policy: Experiment folder not found"
              kill $ROUTER_PID 2>/dev/null || true
              exit 1
            fi

            # Stop the router before testing the next policy
            echo "Stopping router for $policy..."
            # First try a graceful shutdown
            kill $ROUTER_PID 2>/dev/null || true
            # Wait up to 5 seconds for graceful shutdown
            for i in {1..5}; do
              if ! ps -p $ROUTER_PID > /dev/null 2>&1; then
                echo "Router stopped gracefully"
                break
              fi
              sleep 1
            done
            # Force kill if still running
            if ps -p $ROUTER_PID > /dev/null 2>&1; then
              echo "Force killing router..."
              kill -9 $ROUTER_PID 2>/dev/null || true
            fi
            # Short delay to ensure the port is released
            sleep 2

            echo "✓ Completed testing for $policy"
          done

          echo ""
          echo "✅ All policies tested successfully!"
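      # NOTE: the upload glob below assumes genai-bench wrote each run to
      # ./benchmark_<policy>/ (set via --experiment-folder-name above); the
      # summarize-benchmarks job downloads this artifact and rediscovers the
      # folders with find.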
      - name: Upload benchmark results
        if: success()
        uses: actions/upload-artifact@v4
        with:
          name: genai-bench-results-all-policies
          path: benchmark_**/

      - name: Cleanup servers
        if: always()
        run: |
          if [ -n "${{ steps.start_servers.outputs.server_pid }}" ]; then
            pkill -P ${{ steps.start_servers.outputs.server_pid }} || true
            kill ${{ steps.start_servers.outputs.server_pid }} || true
          fi
          pkill -f "sglang.launch_server" || true
          sleep 5
          remaining=$(ps aux | grep -c "sglang.launch_server" || echo "0")
          echo "Cleanup completed. Remaining processes: $remaining"

  summarize-benchmarks:
    needs: test-disaggregation
    runs-on: ubuntu-latest
    if: success()
    steps:
      - name: Install jq and bc
        run: sudo apt-get update && sudo apt-get install -y jq bc

      - name: Download benchmark results
        uses: actions/download-artifact@v4
        with:
          name: genai-bench-results-all-policies

      - name: List downloaded contents
        run: |
          echo "Contents after download:"
          ls -la
          find . -name "benchmark_*" -type d
          echo "JSON files found:"
          find . -name "*.json" | head -10

      - name: Create benchmark summary
        run: |
          echo "=== DEBUG: Creating benchmark summary ==="
          echo "Available benchmark directories:"
          find . -name "benchmark_*" -type d
          echo "=========================================="

          echo "## PD Router Genai-Bench Results Summary" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "🚀 **Benchmarked with genai-bench for comprehensive LLM serving performance evaluation**" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "| Policy | Status | TTFT (s) | E2E Latency (s) | Input Throughput (tok/s) | Output Throughput (tok/s) |" >> $GITHUB_STEP_SUMMARY
          echo "|--------|--------|----------|-----------------|--------------------------|---------------------------|" >> $GITHUB_STEP_SUMMARY

          # Populate the table with one row per policy
          for policy in random round_robin cache_aware power_of_two; do
            # Find genai-bench result folders for this policy (handle zip extraction structure)
            result_folder=$(find . -maxdepth 2 -name "benchmark_${policy}" -type d | head -1)
            if [ -z "$result_folder" ]; then
              # Try alternative patterns in case of a different extraction structure
              result_folder=$(find . -maxdepth 3 -path "*benchmark_${policy}*" -type d | head -1)
            fi
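            # One markdown table row per policy; a missing folder or JSON file
            # degrades to an N/A or failed row instead of aborting the summary.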
            echo "DEBUG: Policy ${policy} -> Found folder: ${result_folder:-'NOT FOUND'}"

            if [ -n "$result_folder" ] && [ -d "$result_folder" ]; then
              # Find a JSON file with metrics
              json_file=$(find "$result_folder" -name "*.json" -not -name "experiment_metadata.json" | head -1)
              if [ -n "$json_file" ] && [ -f "$json_file" ]; then
                # Extract performance metrics
                ttft_mean=$(jq -r '.aggregated_metrics.stats.ttft.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
                e2e_latency_mean=$(jq -r '.aggregated_metrics.stats.e2e_latency.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
                input_throughput_mean=$(jq -r '.aggregated_metrics.stats.input_throughput.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
                output_throughput_mean=$(jq -r '.aggregated_metrics.stats.output_throughput.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")

                # Format numbers for display (2 decimal places)
                if [ "$ttft_mean" != "N/A" ] && [ "$ttft_mean" != "null" ]; then
                  ttft_display=$(printf "%.2f" "$ttft_mean" 2>/dev/null || echo "$ttft_mean")
                else
                  ttft_display="N/A"
                fi
                if [ "$e2e_latency_mean" != "N/A" ] && [ "$e2e_latency_mean" != "null" ]; then
                  e2e_display=$(printf "%.2f" "$e2e_latency_mean" 2>/dev/null || echo "$e2e_latency_mean")
                else
                  e2e_display="N/A"
                fi
                if [ "$input_throughput_mean" != "N/A" ] && [ "$input_throughput_mean" != "null" ]; then
                  input_display=$(printf "%.0f" "$input_throughput_mean" 2>/dev/null || echo "$input_throughput_mean")
                else
                  input_display="N/A"
                fi
                if [ "$output_throughput_mean" != "N/A" ] && [ "$output_throughput_mean" != "null" ]; then
                  output_display=$(printf "%.0f" "$output_throughput_mean" 2>/dev/null || echo "$output_throughput_mean")
                else
                  output_display="N/A"
                fi

                echo "| ${policy} | ✅ Success | $ttft_display | $e2e_display | $input_display | $output_display |" >> $GITHUB_STEP_SUMMARY
              else
                echo "| ${policy} | ❌ No Data | N/A | N/A | N/A | N/A |" >> $GITHUB_STEP_SUMMARY
              fi
            else
              echo "| ${policy} | ❌ Failed | N/A | N/A | N/A | N/A |" >> $GITHUB_STEP_SUMMARY
            fi
          done

          # Add a performance validation summary
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "## 📊 Performance Validation" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "**Thresholds:** TTFT ≤ 4.7s | E2E Latency ≤ 35.0s | Input Throughput ≥ 10,000 tok/s | Output Throughput ≥ 68 tok/s" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY

          validation_summary=""
          for policy in random round_robin cache_aware power_of_two; do
            # Use the same robust path finding as above
            result_folder=$(find . -maxdepth 2 -name "benchmark_${policy}" -type d | head -1)
            if [ -z "$result_folder" ]; then
              result_folder=$(find . -maxdepth 3 -path "*benchmark_${policy}*" -type d | head -1)
            fi
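            # These limits mirror the thresholds enforced in the
            # test-disaggregation job above; this summary only annotates
            # pass/fail, it never fails the build.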
            if [ -n "$result_folder" ] && [ -d "$result_folder" ]; then
              json_file=$(find "$result_folder" -name "*.json" -not -name "experiment_metadata.json" | head -1)
              if [ -n "$json_file" ] && [ -f "$json_file" ]; then
                # Extract metrics for validation
                ttft=$(jq -r '.aggregated_metrics.stats.ttft.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
                e2e_latency=$(jq -r '.aggregated_metrics.stats.e2e_latency.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
                input_throughput=$(jq -r '.aggregated_metrics.stats.input_throughput.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
                output_throughput=$(jq -r '.aggregated_metrics.stats.output_throughput.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")

                # Check thresholds (same values as in the main workflow)
                validation_status="✅"
                if [ "$ttft" != "N/A" ] && [ "$ttft" != "null" ]; then
                  if (( $(echo "$ttft > 4.7" | bc -l 2>/dev/null || echo "0") )); then
                    validation_status="❌"
                  fi
                fi
                if [ "$e2e_latency" != "N/A" ] && [ "$e2e_latency" != "null" ]; then
                  if (( $(echo "$e2e_latency > 35.0" | bc -l 2>/dev/null || echo "0") )); then
                    validation_status="❌"
                  fi
                fi
                if [ "$input_throughput" != "N/A" ] && [ "$input_throughput" != "null" ]; then
                  if (( $(echo "$input_throughput < 10000" | bc -l 2>/dev/null || echo "0") )); then
                    validation_status="❌"
                  fi
                fi
                if [ "$output_throughput" != "N/A" ] && [ "$output_throughput" != "null" ]; then
                  if (( $(echo "$output_throughput < 68" | bc -l 2>/dev/null || echo "0") )); then
                    validation_status="❌"
                  fi
                fi

                validation_summary="${validation_summary}- **${policy}**: $validation_status\n"
              else
                validation_summary="${validation_summary}- **${policy}**: ❌ No data\n"
              fi
            else
              validation_summary="${validation_summary}- **${policy}**: ❌ Failed\n"
            fi
          done
          echo -e "$validation_summary" >> $GITHUB_STEP_SUMMARY

          echo "" >> $GITHUB_STEP_SUMMARY
          echo "## 📊 Genai-Bench Features Used" >> $GITHUB_STEP_SUMMARY
          echo "- **Token-level Performance**: TTFT, TPOT, End-to-End latency" >> $GITHUB_STEP_SUMMARY
          echo "- **Throughput Analysis**: Input/Output/Total token throughput" >> $GITHUB_STEP_SUMMARY
          echo "- **Statistical Analysis**: Percentiles, mean, std dev for all metrics" >> $GITHUB_STEP_SUMMARY
          echo "- **Visual Reports**: Automated plots and Excel summaries" >> $GITHUB_STEP_SUMMARY
          echo "- **SGLang Backend**: Native integration with SGLang serving" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "✅ All policies tested successfully with genai-bench!" >> $GITHUB_STEP_SUMMARY