name: PR E2E Test

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  workflow_dispatch:

jobs:
  gpu-job:
    runs-on: self-hosted
    env:
      # Pin the job to a single GPU on the shared self-hosted runner
      CUDA_VISIBLE_DEVICES: "6"
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Install dependencies
        run: |
          pip install --upgrade pip
          # Editable install of the package in python/ with all optional extras
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/ --force-reinstall
          pip install --upgrade transformers

      - name: Launch server and run benchmark
        run: |
          # Start the server in the background on a fixed port
          python3 -m sglang.launch_server --model /home/lmzheng/zhyncs/Meta-Llama-3.1-8B-Instruct --port 8413 &

          # Poll the health endpoint for up to 60 seconds before benchmarking
          echo "Waiting for server to start..."
          for i in {1..60}; do
            if curl -s http://127.0.0.1:8413/health; then
              echo "Server is up!"
              break
            fi
            if [ "$i" -eq 60 ]; then
              echo "Server failed to start within 60 seconds"
              exit 1
            fi
            sleep 1
          done

          python3 -m sglang.bench_serving --backend sglang --port 8413

          # Kill every process matching the model name, including spawned workers
          echo "Stopping server..."
          kill -9 $(ps aux | grep sglang | grep Meta-Llama-3.1-8B-Instruct | grep -v grep | awk '{print $2}')
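
# A possible hardening, not part of the original workflow: the inline kill above
# only runs when the benchmark succeeds, so a failed run can leave the server
# alive on the self-hosted runner. Moving the shutdown into its own step with
# `if: always()` guarantees cleanup on every outcome. This is a sketch reusing
# the same grep-based process match; the step name is hypothetical.
#
#      - name: Stop server
#        if: always()
#        run: |
#          kill -9 $(ps aux | grep sglang | grep Meta-Llama-3.1-8B-Instruct | grep -v grep | awk '{print $2}') || true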