# Workflow that exercises SGLang's disaggregation mode end to end.
name: Test Disaggregation Mode

# Triggers: pushes and pull requests against main that touch the
# disaggregation sources, the CI server start script, or sgl-router;
# the workflow can also be started manually.
on:
  push:
    branches:
      - main
    paths:
      - 'python/sglang/srt/disaggregation/**'
      - 'scripts/ci_start_disaggregation_servers.sh'
      - 'sgl-router/**'
  pull_request:
    branches:
      - main
    paths:
      - 'python/sglang/srt/disaggregation/**'
      - 'scripts/ci_start_disaggregation_servers.sh'
      - 'sgl-router/**'
  workflow_dispatch:

# One active run per git ref: starting a new run for the same ref cancels
# the run that is still in progress.
concurrency:
  group: test-disaggregation-${{ github.ref }}
  cancel-in-progress: true

# Scopes granted to the workflow's GITHUB_TOKEN.
# NOTE(review): none of the steps visible in this file write to pull requests
# or issues — confirm whether the two write scopes are still required.
permissions:
  contents: read
  pull-requests: write
  issues: write

jobs:
  # Single job: validates the runner, installs SGLang and sgl-router, starts
  # the disaggregation servers, then runs API and benchmark checks.
  test-disaggregation:
    # Runs in the sgl-project/sglang repository, or for any pull_request event.
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    # Requires a runner carrying the `h200` label.
    runs-on:
      - h200
    timeout-minutes: 45

    steps:
    # Shallow checkout: only the 10 most recent commits are fetched.
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        fetch-depth: 10

    # Provides Python 3.11 for the install, build and benchmark steps.
    # Uses setup-python v5: v4 targets the deprecated Node 16 action runtime,
    # whereas the checkout/cache actions in this job are already on Node 20.
    # The version stays quoted so YAML never reads it as a float.
    - name: Setup Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.11'

    # Runs the repository script scripts/ci_install_rust.sh.
    - name: Setup Rust
      run: bash scripts/ci_install_rust.sh

    # Caches Cargo's binaries, registry and git checkouts together with the
    # sgl-router build directory, keyed on runner OS + sgl-router/Cargo.lock.
    - name: Cache Rust dependencies
      uses: actions/cache@v4
      with:
        key: ${{ runner.os }}-cargo-${{ hashFiles('sgl-router/Cargo.lock') }}
        # Prefix fallback used when no cache matches the exact key.
        restore-keys: ${{ runner.os }}-cargo-
        path: |
          ~/.cargo/bin/
          ~/.cargo/registry/index/
          ~/.cargo/registry/cache/
          ~/.cargo/git/db/
          sgl-router/target/

    # Caches pip's download/wheel cache directory, keyed on runner OS and the
    # hash of python/pyproject.toml.
    - name: Cache pip dependencies
      uses: actions/cache@v4
      with:
        key: ${{ runner.os }}-pip-${{ hashFiles('python/pyproject.toml') }}
        # Prefix fallback used when no cache matches the exact key.
        restore-keys: ${{ runner.os }}-pip-
        path: ~/.cache/pip

    # Pre-flight checks: at least 8 GPUs, InfiniBand tooling with one active
    # mlx5 port, and the Llama-3.1-8B-Instruct model directory on local disk.
    # The step exits non-zero on the first missing prerequisite.
    - name: Validate environment
      run: |
        echo "=== System Validation ==="
        nvidia-smi
        echo "GPU count: $(nvidia-smi -L | wc -l)"
        gpu_count=$(nvidia-smi -L | wc -l)
        if [[ "$gpu_count" -lt 8 ]]; then
          echo "Error: This test requires at least 8 GPUs"
          exit 1
        fi

        echo "=== RDMA Validation ==="
        command -v ibv_devices >/dev/null 2>&1 || {
          echo "Error: InfiniBand tools not found"
          exit 1
        }

        # Probe mlx5_0 .. mlx5_11 and stop at the first port reported active
        ib_active=false
        for device in mlx5_{0..11}; do
            # Skip device names that ibv_devinfo cannot query
            ibv_devinfo "$device" >/dev/null 2>&1 || continue
            state=$(ibv_devinfo "$device" | grep "state:" | head -1 | awk '{print $2}')
            [[ "$state" == "PORT_ACTIVE" ]] || continue
            echo "✓ Found active device: $device"
            ib_active=true
            break
        done

        if [[ "$ib_active" != true ]]; then
          echo "Error: No active IB devices found"
          echo "Available devices:"
          ibv_devices || true
          exit 1
        fi

        echo "=== Model Validation ==="
        [[ -d "/raid/models/meta-llama/Llama-3.1-8B-Instruct" ]] || {
          echo "Error: Model not found"
          ls -la /raid/models/ || echo "No models directory"
          exit 1
        }
        echo "✓ Model found"

    # Installs the `python` directory as an editable package with its `all`
    # extras, then a pinned mooncake-transfer-engine release.
    # NOTE(review): --no-cache-dir tells pip to bypass its cache, so the
    # ~/.cache/pip directory restored by the "Cache pip dependencies" step is
    # not used by these installs — confirm which of the two is intended.
    # NOTE(review): only the first install passes --break-system-packages —
    # confirm whether the second one needs it as well.
    - name: Install SGLang dependencies
      run: |
        echo "Installing SGLang with all extras..."
        python3 -m pip --no-cache-dir install -e "python[all]" --break-system-packages
        python3 -m pip --no-cache-dir install mooncake-transfer-engine==0.3.4.post1

    # Compiles sgl-router with Cargo, builds its Python distribution and
    # installs the resulting wheel.
    - name: Build and install sgl-router
      run: |
        # Puts the Cargo toolchain on PATH for this step.
        source "$HOME/.cargo/env"
        echo "Building sgl-router..."
        cd sgl-router
        # Install through `python3 -m pip` so the wheel lands in the same
        # interpreter that built it and that the other steps invoke; a bare
        # `pip` on PATH is not guaranteed to belong to that interpreter.
        cargo build && python3 -m build && python3 -m pip install --force-reinstall dist/*.whl

    # Launches scripts/ci_start_disaggregation_servers.sh in the background and
    # polls http://127.0.0.9:8000 every 10s (up to 300s) until it answers.
    # The launcher PID is published as the step output `server_pid`.
    - name: Start disaggregation servers
      id: start_servers
      run: |
        echo "Starting disaggregation servers..."
        bash scripts/ci_start_disaggregation_servers.sh &
        SERVER_PID=$!
        echo "server_pid=$SERVER_PID" >> "$GITHUB_OUTPUT"

        echo "Waiting for router to become healthy..."
        TIMEOUT=300
        ELAPSED=0
        while [ "$ELAPSED" -lt "$TIMEOUT" ]; do
          # --connect-timeout only bounds the TCP handshake; --max-time also
          # bounds the wait for a response, so a port that accepts connections
          # but never answers cannot stall this loop past its time budget.
          if curl --connect-timeout 5 --max-time 10 --silent http://127.0.0.9:8000 > /dev/null 2>&1; then
            echo "✓ Router is reachable"
            break
          fi
          # Stop waiting as soon as the launcher process itself has exited.
          if ! ps -p "$SERVER_PID" > /dev/null; then
            echo "Error: Server processes failed to start"
            exit 1
          fi
          echo "Waiting for router... (${ELAPSED}s/${TIMEOUT}s)"
          sleep 10
          ELAPSED=$((ELAPSED + 10))
        done

        # The loop only ends with ELAPSED >= TIMEOUT when no probe succeeded.
        if [ "$ELAPSED" -ge "$TIMEOUT" ]; then
          echo "Error: Router health check timeout after ${TIMEOUT}s"
          exit 1
        fi

        echo "✓ Servers started and healthy (PID: $SERVER_PID)"

    # Sends one non-streaming and one streaming request to
    # /v1/chat/completions and fails the step when the JSON response has no
    # message content or the streamed response contains no "data:" line.
    - name: Test API functionality
      timeout-minutes: 5
      run: |
        BASE_URL="http://127.0.0.9:8000"

        echo "Testing API completions..."
        # -sS hides the progress meter but still prints transport errors, and
        # the || branch reports the failure; otherwise a failing curl inside
        # the command substitution aborts the step with no message at all.
        response=$(curl -sS -X POST "$BASE_URL/v1/chat/completions" \
          -H "Content-Type: application/json" \
          -H "Authorization: Bearer test-token" \
          -d '{
            "model": "/raid/models/meta-llama/Llama-3.1-8B-Instruct",
            "messages": [
              {"role": "user", "content": "Write a Python function to calculate fibonacci numbers recursively"}
            ],
            "stream": false,
            "max_tokens": 100
          }') || {
          echo "✗ API request failed (curl exit code $?)"
          exit 1
        }

        # jq -e exits non-zero when the selected value is null or false.
        if echo "$response" | jq -e '.choices[0].message.content' > /dev/null 2>&1; then
          echo "✓ API test passed"
        else
          echo "✗ API test failed: $response"
          exit 1
        fi

        echo "Testing streaming API..."
        # The request is capped at 30s; a timeout or curl error still fails the
        # step, but now with an explicit message.
        stream_response=$(timeout 30 curl -sS -X POST "$BASE_URL/v1/chat/completions" \
          -H "Content-Type: application/json" \
          -H "Authorization: Bearer test-token" \
          -d '{
            "model": "/raid/models/meta-llama/Llama-3.1-8B-Instruct",
            "messages": [
              {"role": "user", "content": "Count from 1 to 5"}
            ],
            "stream": true,
            "max_tokens": 50
          }') || {
          echo "✗ Streaming API request failed or timed out (exit code $?)"
          exit 1
        }

        if echo "$stream_response" | grep -q "data:"; then
          echo "✓ Streaming API test passed"
        else
          echo "✗ Streaming API test failed"
          exit 1
        fi

    # Runs sglang.bench_one_batch_server against http://127.0.0.9:8000, parses
    # latency and throughput from its output, and fails the step when a metric
    # cannot be parsed or misses its threshold.
    - name: Run benchmark test
      timeout-minutes: 5
      run: |
        echo "Running benchmark test..."
        benchmark_output=$(python3 -m sglang.bench_one_batch_server \
          --model-path "/raid/models/meta-llama/Llama-3.1-8B-Instruct" \
          --base-url "http://127.0.0.9:8000" \
          --batch-size 8 \
          --input-len 4096 \
          --output-len 5 \
          --skip-warmup)

        echo "$benchmark_output"

        # Extract metrics from output
        latency=$(echo "$benchmark_output" | grep "latency:" | awk '{print $2}' | sed 's/s//')
        input_throughput=$(echo "$benchmark_output" | grep "input throughput:" | awk '{print $3}')
        output_throughput=$(echo "$benchmark_output" | grep "output throughput:" | awk '{print $3}')

        echo "Performance: ${latency}s | ${input_throughput} | ${output_throughput} tok/s"

        # Every metric must be a single plain decimal number. With an empty or
        # malformed value bc prints nothing, the (( )) tests below evaluate to
        # false, and the validation would pass without checking anything.
        number_re='^[0-9]+([.][0-9]+)?$'
        for metric in latency input_throughput output_throughput; do
          if ! [[ "${!metric}" =~ $number_re ]]; then
            echo "✗ Benchmark failed: could not parse ${metric} from benchmark output (got '${!metric}')"
            exit 1
          fi
        done

        # Validate performance (latency<1.5s, input>20k, output>1k)
        command -v bc >/dev/null || (apt-get update && apt-get install -y bc)

        # bc prints 1 when a comparison holds; collect every violated threshold.
        fail=""
        (( $(echo "$latency > 1.5" | bc -l) )) && fail="Latency too high (${latency}s>1.5s) "
        (( $(echo "$input_throughput < 20000" | bc -l) )) && fail="${fail}Input too low (${input_throughput}<20k) "
        (( $(echo "$output_throughput < 1000" | bc -l) )) && fail="${fail}Output too low (${output_throughput}<1k) "

        if [ -n "$fail" ]; then
          echo "✗ Benchmark failed: $fail"
          exit 1
        else
          echo "✓ Performance validation passed"
        fi

    # Always runs: stops the launcher recorded by the start_servers step and
    # its children, kills any remaining sglang.launch_server processes, then
    # reports how many are still alive.
    - name: Cleanup servers
      if: always()
      env:
        # Passed through the environment rather than interpolated into the
        # script text; empty when the start step never produced the output.
        SERVER_PID: ${{ steps.start_servers.outputs.server_pid }}
      run: |
        if [ -n "${SERVER_PID:-}" ]; then
          pkill -P "$SERVER_PID" || true
          kill "$SERVER_PID" || true
        fi
        pkill -f "sglang.launch_server" || true
        sleep 5
        # pgrep never counts itself and prints a single number. It exits 1 when
        # the count is 0, hence `|| true` to keep errexit from failing the step.
        remaining=$(pgrep -c -f "sglang.launch_server" || true)
        echo "Cleanup completed. Remaining processes: $remaining"