---
# Workflow triggers: pushes to main, pull requests targeting main, and manual
# dispatch. Push and pull_request runs are limited to changes under sgl-router/.
name: "PR Test (SMG)"

on:
  push:
    branches:
      - main
    paths:
      - 'sgl-router/**'
  # Pull-request runs fire when new commits are pushed to the PR or when a
  # label is added to it; the jobs in this file additionally check for the
  # 'run-ci' label.
  pull_request:
    types:
      - synchronize
      - labeled
    branches:
      - main
    paths:
      - 'sgl-router/**'
  workflow_dispatch:

# Runs are grouped per git ref; a newer run in the same group cancels the one
# that is still in progress.
concurrency:
  cancel-in-progress: true
  group: "pr-test-rust-${{ github.ref }}"

# Workflow-wide environment: rustc is wrapped by sccache, and
# SCCACHE_GHA_ENABLED switches on sccache's GitHub Actions cache backend.
env:
  SCCACHE_GHA_ENABLED: 'true'
  RUSTC_WRAPPER: 'sccache'

jobs:
  # Builds the sgl-router wheel with maturin and smoke-tests the installed
  # package. Skipped for pull requests that do not carry the 'run-ci' label.
  maturin-build-test:
    runs-on: ubuntu-latest
    if: "github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-ci')"
    steps:
      # The repository is checked out into a subdirectory so that the contents
      # of sgl-router/ can be moved to the workspace root in the next step.
      - uses: actions/checkout@v4
        with:
          path: 'sglang-repo'

      - name: 'Move sgl-router folder to root'
        run: |
          mv sglang-repo/sgl-router/* .
          rm -rf sglang-repo

      - name: 'Set up Python'
        uses: actions/setup-python@v5
        with:
          python-version: '3.13'

      # protoc comes from the upstream release archive, unpacked into /usr/local.
      - name: 'Install protoc and dependencies'
        run: |
          sudo apt-get update
          sudo apt-get install -y wget unzip gcc g++ perl make
          cd /tmp
          wget https://github.com/protocolbuffers/protobuf/releases/download/v32.0/protoc-32.0-linux-x86_64.zip
          sudo unzip protoc-32.0-linux-x86_64.zip -d /usr/local
          rm protoc-32.0-linux-x86_64.zip
          protoc --version

      - name: 'Configure sccache'
        uses: mozilla-actions/sccache-action@v0.0.9
        with:
          version: 'v0.10.0'

      - name: 'Test maturin build'
        uses: PyO3/maturin-action@v1
        with:
          rust-toolchain: stable
          sccache: true
          args: '--release --out dist --features vendored-openssl'

      - name: 'List built wheel'
        run: 'ls -lh dist/'

      # Checks the package import, the Rust extension import, and the
      # launch_router entry point, in that order.
      - name: 'Test wheel install'
        run: |
          pip install dist/*.whl
          python -c "import sglang_router; print('Python package: OK')"
          python -c "from sglang_router.sglang_router_rs import Router; print('Rust extension: OK')"
          python -m sglang_router.launch_router --help > /dev/null && echo "Entry point: OK"
  # Rust-side checks for sgl-router: clippy, rustfmt (nightly), cargo tests,
  # and benchmark compilation plus a quick benchmark run. Skipped for pull
  # requests that do not carry the 'run-ci' label.
  unit-test-rust:
    if: github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-ci')
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install dependencies
        run: |
          bash scripts/ci/ci_install_rust.sh

      - name: Configure sccache
        uses: mozilla-actions/sccache-action@v0.0.9
        with:
          version: "v0.10.0"

      - name: Rust cache
        uses: Swatinem/rust-cache@v2
        with:
          workspaces: sgl-router
          cache-all-crates: true
          cache-on-failure: true

      # Warnings are promoted to errors (-D warnings), so any clippy finding
      # fails the job.
      - name: Run lint
        run: |
          source "$HOME/.cargo/env"
          cd sgl-router/
          cargo clippy --all-targets --all-features -- -D warnings

      - name: Run fmt
        run: |
          source "$HOME/.cargo/env"
          cd sgl-router/
          # Install the nightly toolchain and its rustfmt component in a single
          # command, so the component is never requested for a toolchain that
          # has not been installed yet, and no host triple is hard-coded.
          rustup toolchain install nightly --profile minimal --component rustfmt
          cargo +nightly fmt -- --check

      - name: Run Rust tests
        timeout-minutes: 20
        run: |
          source "$HOME/.cargo/env"
          cd sgl-router/
          cargo test

      - name: Check benchmark compilation
        run: |
          source "$HOME/.cargo/env"
          cd sgl-router/
          cargo check --benches

      - name: Quick benchmark sanity check
        timeout-minutes: 15
        run: |
          source "$HOME/.cargo/env"
          cd sgl-router/
          # Run quick benchmarks to ensure they work using Python script
          python3 scripts/run_benchmarks.py --quick

      # Runs even when an earlier step failed, so cache statistics are always
      # available in the log.
      - name: Show sccache stats
        if: always()
        run: sccache --show-stats

  # Builds the Python binding on the 4-gpu-a10 runner and runs the Python
  # unit, integration, and E2E suites; benchmark output is uploaded as an
  # artifact. Skipped for pull requests that do not carry the 'run-ci' label.
  pytest-rust:
    runs-on: 4-gpu-a10
    timeout-minutes: 32
    if: "github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-ci')"
    steps:
      - name: 'Checkout code'
        uses: actions/checkout@v4

      - name: 'Install rust dependencies'
        run: |
          bash scripts/ci/ci_install_rust.sh

      - name: 'Configure sccache'
        uses: mozilla-actions/sccache-action@v0.0.9
        with:
          version: 'v0.10.0'

      - name: 'Rust cache'
        uses: Swatinem/rust-cache@v2
        with:
          cache-on-failure: true
          cache-all-crates: true
          workspaces: 'sgl-router'

      - name: 'Install SGLang dependencies'
        run: |
          sudo --preserve-env=PATH bash scripts/ci/ci_install_dependency.sh

      # The freshly built wheel replaces any previously installed copy
      # (--force-reinstall).
      - name: 'Build python binding'
        run: |
          source "$HOME/.cargo/env"
          export RUSTC_WRAPPER=sccache
          cd sgl-router
          pip install maturin
          maturin build --release --out dist --features vendored-openssl
          pip install --force-reinstall dist/*.whl

      # The unit suite fails if coverage of sglang_router drops below 80%.
      - name: 'Run Python unit tests'
        run: |
          cd sgl-router
          source "$HOME/.cargo/env"
          pip install pytest pytest-cov pytest-xdist
          pytest -q py_test/unit --cov=sglang_router --cov-report=term-missing --cov-fail-under=80

      - name: 'Run Python integration tests'
        run: |
          cd sgl-router
          source "$HOME/.cargo/env"
          # Integration tests use FastAPI/uvicorn for mock workers
          pip install fastapi uvicorn orjson
          pytest -q -m integration

      - name: 'Run Python E2E tests'
        run: |
          bash scripts/killall_sglang.sh "nuk_gpus"
          cd sgl-router
          python3 -m pip --no-cache-dir install --upgrade --ignore-installed blinker
          python3 -m pip --no-cache-dir install --upgrade --break-system-packages genai-bench==0.0.2
          pytest -m e2e -s  -vv -o log_cli=true --log-cli-level=INFO

      # Only uploaded when every previous step succeeded; the
      # summarize-benchmarks job downloads this artifact by name.
      - name: 'Upload benchmark results'
        if: success()
        uses: actions/upload-artifact@v4
        with:
          name: 'genai-bench-results-all-policies'
          path: 'sgl-router/benchmark_**/'

  # Second GPU job: sets up the Oracle Instant Client and a local Oracle XE
  # container, builds the Python binding, and runs the response-API and gRPC
  # E2E suites. Skipped for pull requests that do not carry the 'run-ci' label.
  pytest-rust-2:
    if: github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-ci')
    runs-on: 4-gpu-a10
    timeout-minutes: 32
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install rust dependencies
        run: |
          bash scripts/ci/ci_install_rust.sh

      - name: Configure sccache
        uses: mozilla-actions/sccache-action@v0.0.9
        with:
          version: "v0.10.0"

      - name: Rust cache
        uses: Swatinem/rust-cache@v2
        with:
          workspaces: sgl-router
          cache-all-crates: true
          cache-on-failure: true

      - name: Install SGLang dependencies
        run: |
          sudo --preserve-env=PATH bash scripts/ci/ci_install_dependency.sh

      # The client is downloaded only when its directory is missing, so an
      # existing copy on the runner is reused.
      - name: Setup Oracle Instant Client
        run: |
          sudo apt-get install -y unzip
          INSTANT_CLIENT_DIR="/home/ubuntu/instant-client"
          INSTANT_CLIENT_ZIP="instantclient-basic-linux.x64-23.9.0.25.07.zip"

          if [ ! -d "$INSTANT_CLIENT_DIR/instantclient_23_9" ]; then
            echo "Downloading Oracle Instant Client..."
            mkdir -p "$INSTANT_CLIENT_DIR"
            cd "$INSTANT_CLIENT_DIR"
            wget https://download.oracle.com/otn_software/linux/instantclient/2390000/$INSTANT_CLIENT_ZIP
            unzip $INSTANT_CLIENT_ZIP
            rm $INSTANT_CLIENT_ZIP
          else
            echo "Oracle Instant Client already exists, skipping download"
          fi

          # GITHUB_ENV entries are stored verbatim (no variable expansion), so
          # the current LD_LIBRARY_PATH must be expanded here. The ':' separator
          # is only added when a previous value exists, avoiding an empty entry.
          echo "LD_LIBRARY_PATH=$INSTANT_CLIENT_DIR/instantclient_23_9${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" >> "$GITHUB_ENV"

      - name: Start Oracle Database
        run: |
          # Remove any container left behind by an interrupted earlier run; the
          # fixed container name would otherwise make `docker run` fail.
          docker rm -f oracle-db >/dev/null 2>&1 || true
          docker run -d -p 1521:1521 -e ORACLE_PASSWORD=oracle --name oracle-db gvenzl/oracle-xe:21-slim
          echo "Starting Oracle DB..."

          # Export Oracle connection environment variables
          echo "ATP_USER=system" >> "$GITHUB_ENV"
          echo "ATP_PASSWORD=oracle" >> "$GITHUB_ENV"
          echo "ATP_DSN=localhost:1521/XEPDB1" >> "$GITHUB_ENV"

      - name: Build python binding
        run: |
          source "$HOME/.cargo/env"
          export RUSTC_WRAPPER=sccache
          cd sgl-router
          pip install maturin
          maturin build --release --out dist --features vendored-openssl
          pip install --force-reinstall dist/*.whl

      - name: Run Python E2E response API tests
        run: |
          bash scripts/killall_sglang.sh "nuk_gpus"
          cd sgl-router
          SHOW_ROUTER_LOGS=1 pytest py_test/e2e_response_api -s -vv -o log_cli=true --log-cli-level=INFO

      - name: Run Python E2E gRPC tests
        run: |
          bash scripts/killall_sglang.sh "nuk_gpus"
          cd sgl-router
          SHOW_ROUTER_LOGS=1 ROUTER_LOCAL_MODEL_PATH="/home/ubuntu/models" pytest py_test/e2e_grpc -s -vv -o log_cli=true --log-cli-level=INFO

      # Always runs, and tolerates a container that is already gone.
      - name: Cleanup Oracle Database
        if: always()
        run: |
          docker stop oracle-db || true
          docker rm oracle-db || true


  # Fan-in job: depends on all four test jobs listed under `needs`.
  finish:
    runs-on: ubuntu-latest
    needs:
      - maturin-build-test
      - unit-test-rust
      - pytest-rust
      - pytest-rust-2
    steps:
      - name: 'Finish'
        run: 'echo "This is an empty step to ensure that all jobs are completed."'

  # Downloads the genai-bench artifact uploaded by pytest-rust and renders a
  # markdown summary (scenario table plus optional GPU utilization tables)
  # into the job summary.
  summarize-benchmarks:
    needs: pytest-rust
    runs-on: ubuntu-latest
    if: success()

    steps:
      - name: Install jq
        run: sudo apt-get update && sudo apt-get install -y jq bc

      - name: Download benchmark results
        uses: actions/download-artifact@v4
        with:
          name: genai-bench-results-all-policies

      - name: List downloaded contents
        run: |
          echo "Contents after download:"
          ls -la
          find . -name "benchmark_*" -type d
          echo "JSON files found:"
          find . -name "*.json" | head -10

      - name: Create benchmark summary
        run: |
          echo "=== DEBUG: Creating benchmark summary ==="
          echo "Available benchmark directories:"
          find . -name "benchmark_*" -type d || true
          echo "=========================================="

          # Format "$2" with the printf format "$1". Bash's printf still prints
          # a zero value (and then fails) for non-numeric input such as jq's
          # "null", so the formatted text is used only when printf succeeds;
          # otherwise the raw value is shown unchanged.
          fmt_num() {
            local formatted
            if formatted=$(printf "$1" "$2" 2>/dev/null); then
              echo "$formatted"
            else
              echo "$2"
            fi
          }

          # GPU sections are buffered in a file and appended after the loop so
          # that they do not interrupt the rows of the scenario table. A file is
          # used because the loop below runs in a pipeline subshell.
          gpu_sections=$(mktemp)

          echo "## Router E2E Genai-Bench Results Summary" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "Results captured from E2E tests for two scenarios: regular router (2 workers, dp=2) and PD router (2 prefill + 2 decode)." >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "| Scenario | Status | TTFT (s) | E2E Latency (s) | Input Throughput (tok/s) | Output Throughput (tok/s) |" >> "$GITHUB_STEP_SUMMARY"
          echo "|----------|--------|----------|-----------------|--------------------------|---------------------------|" >> "$GITHUB_STEP_SUMMARY"

          # One "label|directory pattern" pair per line.
          scenarios=$'Regular (dp=2, round_robin)|benchmark_round_robin_regular\nPD (2 prefill + 2 decode, round_robin)|benchmark_round_robin_pd'

          echo "$scenarios" | sed 's/^\s*//' | while IFS='|' read -r label pattern; do
            [ -z "$label" ] && continue
            # Find the result folder (handle different extraction layouts)
            result_folder=$(find . -maxdepth 3 \( -name "$pattern" -o -path "*${pattern}*" \) -type d | head -1)

            if [ -n "$result_folder" ] && [ -d "$result_folder" ]; then
              json_file=$(find "$result_folder" -name "*.json" -not -name "experiment_metadata.json" | head -1)

              if [ -n "$json_file" ] && [ -f "$json_file" ]; then
                ttft_mean=$(jq -r '.aggregated_metrics.stats.ttft.mean' "$json_file")
                e2e_latency_mean=$(jq -r '.aggregated_metrics.stats.e2e_latency.mean' "$json_file")
                input_throughput_mean=$(jq -r '.aggregated_metrics.stats.input_throughput.mean' "$json_file")
                output_throughput_mean=$(jq -r '.aggregated_metrics.stats.output_throughput.mean' "$json_file")

                ttft_display=$(fmt_num "%.2f" "$ttft_mean")
                e2e_display=$(fmt_num "%.2f" "$e2e_latency_mean")
                input_display=$(fmt_num "%.0f" "$input_throughput_mean")
                output_display=$(fmt_num "%.0f" "$output_throughput_mean")

                echo "| ${label} | ✅ Success | $ttft_display | $e2e_display | $input_display | $output_display |" >> "$GITHUB_STEP_SUMMARY"

                # Optional GPU utilization table if monitor output exists
                gpu_json="$result_folder/gpu_utilization.json"
                if [ -f "$gpu_json" ]; then
                  overall_mean=$(jq -r '.overall.mean // 0' "$gpu_json")
                  {
                    printf "\n#### GPU Utilization — %s\n\n" "$label"
                    printf "Overall mean: %.2f%%\n\n" "$overall_mean"
                    echo "| GPU | Mean (%) | p5 | p10 | p25 | p50 | p75 | p90 | p95 |"
                    echo "|-----|----------|----|-----|-----|-----|-----|-----|-----|"
                    jq -r '
                      .per_gpu
                      | to_entries[]
                      | [ .key,
                          (.value.mean // 0),
                          (.value.p5 // 0),
                          (.value.p10 // 0),
                          (.value.p25 // 0),
                          (.value.p50 // 0),
                          (.value.p75 // 0),
                          (.value.p90 // 0),
                          (.value.p95 // 0)
                        ]
                      | @tsv' "$gpu_json" \
                      | while IFS=$'\t' read -r gpu m p5 p10 p25 p50 p75 p90 p95; do
                          printf "| %s | %.2f | %.2f | %.2f | %.2f | %.2f | %.2f | %.2f | %.2f |\n" "$gpu" "$m" "$p5" "$p10" "$p25" "$p50" "$p75" "$p90" "$p95"
                        done
                    echo ""
                  } >> "$gpu_sections"
                fi
              else
                # Keep the scenario visible in the table instead of dropping it silently.
                echo "| ${label} | ⚠️ No result JSON | - | - | - | - |" >> "$GITHUB_STEP_SUMMARY"
              fi
            else
              echo "| ${label} | ⚠️ Results not found | - | - | - | - |" >> "$GITHUB_STEP_SUMMARY"
            fi
          done

          # Append the buffered GPU sections below the completed scenario table.
          cat "$gpu_sections" >> "$GITHUB_STEP_SUMMARY"
          rm -f "$gpu_sections"