name: PR E2E Test

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  workflow_dispatch:

jobs:
  gpu-job:
    runs-on: self-hosted
    env:
      # Pin the job to a single GPU on the shared self-hosted runner
      CUDA_VISIBLE_DEVICES: "6"
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Install dependencies
        run: |
          pip install --upgrade pip
          # Editable install of the package in python/ with all optional extras
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/ --force-reinstall
          pip install --upgrade transformers

      - name: Launch server and run benchmark
        run: |
          # Start the server in the background on a fixed port
          python3 -m sglang.launch_server --model /home/lmzheng/zhyncs/Meta-Llama-3.1-8B-Instruct --port 8413 &

          # Poll the health endpoint for up to 60 seconds before benchmarking
          echo "Waiting for server to start..."
          for i in {1..60}; do
            if curl -s http://127.0.0.1:8413/health; then
              echo "Server is up!"
              break
            fi
            if [ "$i" -eq 60 ]; then
              echo "Server failed to start within 60 seconds"
              exit 1
            fi
            sleep 1
          done

          python3 -m sglang.bench_serving --backend sglang --port 8413

          # Kill every process matching the model name, including spawned workers
          echo "Stopping server..."
          kill -9 $(ps aux | grep sglang | grep Meta-Llama-3.1-8B-Instruct | grep -v grep | awk '{print $2}')
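
# A possible hardening, not part of the original workflow: the inline kill above
# only runs when the benchmark succeeds, so a failed run can leave the server
# alive on the self-hosted runner. Moving the shutdown into its own step with
# `if: always()` guarantees cleanup on every outcome. This is a sketch reusing
# the same grep-based process match; the step name is hypothetical.
#
#      - name: Stop server
#        if: always()
#        run: |
#          kill -9 $(ps aux | grep sglang | grep Meta-Llama-3.1-8B-Instruct | grep -v grep | awk '{print $2}') || true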