sglangv0.5.2 & support Qwen3-Next-80B-A3B-Instruct

118f1fc7 · maxiao1 · 118f1fc7 · 118f1fc7 · 118f1fc7 · 118f1fc7
Commit 118f1fc7 authored Sep 13, 2025 by maxiao1
20 changed files
--- a/.clang-format-ignore
+++ b/.clang-format-ignore
+sgl-kernel/3rdparty/tensorrt_llm/*
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
+FROM lmsysorg/sglang:dev
+
+# Run every RUN step under bash with pipefail so that a failure on the left
+# side of a pipe (e.g. a failed download in `curl ... | sh`) fails the build
+# instead of being masked by the exit status of the last command.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# Create non-root user with specified UID and GID
+# NOTE: Replace with your own UID and GID. This is a workaround from https://github.com/microsoft/vscode-remote-release/issues/49#issuecomment-489060908.
+ARG HOST_UID=1003
+ARG HOST_GID=1003
+RUN groupadd -g "$HOST_GID" devuser && \
+    useradd -m -u "$HOST_UID" -g "$HOST_GID" -s /bin/zsh devuser
+
+# Give devuser passwordless sudo access.
+# update + install + cleanup stay in one layer so the apt lists never end up
+# in the image; --no-install-recommends avoids pulling in optional packages.
+RUN apt-get update && apt-get install -y --no-install-recommends sudo && \
+    echo "devuser ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/devuser && \
+    rm -rf /var/lib/apt/lists/* && \
+    apt-get clean
+
+# Set up oh-my-zsh for devuser by copying root's shell/editor/tmux config
+# and rewriting the oh-my-zsh path inside the copied .zshrc.
+RUN cp -r /root/.oh-my-zsh /home/devuser/.oh-my-zsh && \
+    cp /root/.zshrc /home/devuser/.zshrc && \
+    cp /root/.vimrc /home/devuser/.vimrc && \
+    cp /root/.tmux.conf /home/devuser/.tmux.conf && \
+    sed -i 's|/root/.oh-my-zsh|/home/devuser/.oh-my-zsh|g' /home/devuser/.zshrc && \
+    chown -R devuser:devuser /home/devuser/
+
+# Set workspace directory and ownership
+WORKDIR /sgl-workspace/sglang
+RUN chown -R devuser:devuser /sgl-workspace
+
+# Switch to devuser; everything below runs (and installs) as the non-root user
+USER devuser
+
+# Install uv
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+
+# Install rust (-y: accept the installer defaults without prompting)
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
+{
+    // Dev container definition for working on sglang.
+    "name": "sglang",
+    "build": {
+        // Image is built from the Dockerfile referenced here.
+        "dockerfile": "Dockerfile"
+    },
+    // Matches the non-root user created in the Dockerfile.
+    "remoteUser": "devuser",
+    "customizations": {
+        "vscode": {
+            "extensions": [
+                // Python development
+                "ms-python.python",
+                "charliermarsh.ruff",
+                // Rust development
+                "rust-lang.rust-analyzer",
+                "tamasfe.even-better-toml"
+            ]
+        }
+    },
+    "forwardPorts": [],
+    // Extra container run arguments: request all GPUs.
+    "runArgs": [
+        "--gpus",
+        "all"
+    ],
+    // The two lines below ensure that your local changes in the sglang
+    // repo are automatically synced to the sglang pip package installed
+    // in the dev docker container. You can remove / comment out these
+    // two lines if you prefer to sync code changes manually.
+    "workspaceMount": "source=${localWorkspaceFolder},target=/sgl-workspace/sglang,type=bind",
+    "workspaceFolder": "/sgl-workspace/sglang"
+}
--- a/.editorconfig
+++ b/.editorconfig
+# https://editorconfig.org/
+
+root = true
+
+[*]
+charset = utf-8
+end_of_line = lf
+indent_style = space
+indent_size = 4
+trim_trailing_whitespace = true
+insert_final_newline = true
+
+[*.{json,yaml,yml}]
+indent_size = 2
+
+[*.md]
+indent_size = 2
+x-soft-wrap-text = true
+
+[*.rst]
+indent_size = 4
+x-soft-wrap-text = true
+
+[Makefile]
+indent_style = tab
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
+.github @merrymercy @zhyncs
+/docker @zhyncs @HaiShaw @ByronHsu
+/python/pyproject.toml @merrymercy @zhyncs
+/python/sglang/* @merrymercy @Ying1123 @zhyncs @hnyls2002
+/python/sglang/srt/constrained @hnyls2002
+/python/sglang/srt/disaggregation @ByronHsu @hnyls2002
+/python/sglang/srt/disaggregation/mooncake @ShangmingCai
+/python/sglang/srt/distributed @yizhang2077 @merrymercy
+/python/sglang/srt/entrypoints @ispobock @CatherineSue @slin1237 @merrymercy
+/python/sglang/srt/eplb @fzyzcjy
+/python/sglang/srt/function_call @CatherineSue
+/python/sglang/srt/layers @merrymercy @Ying1123 @zhyncs @ispobock @HaiShaw @ch-wan @BBuf @kushanam @Edwardf0t1
+/python/sglang/srt/lora @Ying1123 @Fridge003 @lifuhuang
+/python/sglang/srt/managers @merrymercy @Ying1123 @hnyls2002 @xiezhq-hermann
+/python/sglang/srt/mem_cache @merrymercy @Ying1123 @hnyls2002 @xiezhq-hermann
+/python/sglang/srt/model_executor @merrymercy @Ying1123 @hnyls2002 @zhyncs @ispobock
+/python/sglang/srt/multimodal @mickqian @JustinTong0323
+/python/sglang/srt/speculative @Ying1123 @merrymercy @rkooo567 @kssteven418
+/sgl-kernel @zhyncs @ispobock @HandH1998 @BBuf @yizhang2077 @merrymercy @FlamingoPg @HaiShaw
+/sgl-router @slin1237 @ByronHsu
+/test/srt/test_modelopt* @Edwardf0t1
--- a/.github/ISSUE_TEMPLATE/1-bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/1-bug-report.yml
+name: 🐞 Bug report
+description: Create a report to help us reproduce and fix the bug
+title: "[Bug] "
+labels: ['Bug']
+
+body:
+- type: checkboxes
+  attributes:
+    label: Checklist
+    options:
+    - label: 1. I have searched related issues but cannot get the expected help.
+    - label: 2. The bug has not been fixed in the latest version.
+    - label: 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback.
+    - label: 4. If the issue you raised is not a bug but a question, please raise a discussion at https://github.com/sgl-project/sglang/discussions/new/choose. Otherwise, it will be closed.
+    - label: 5. Please use English, otherwise it will be closed.
+- type: textarea
+  attributes:
+    label: Describe the bug
+    description: A clear and concise description of what the bug is.
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Reproduction
+    description: |
+      What command or script did you run? Which **model** are you using?
+    placeholder: |
+      A placeholder for the command.
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Environment
+    description: |
+      Please provide necessary environment information here with `python3 -m sglang.check_env`. Otherwise the issue will be closed.
+    placeholder: Environment here.
+  validations:
+    required: true
--- a/.github/ISSUE_TEMPLATE/2-feature-request.yml
+++ b/.github/ISSUE_TEMPLATE/2-feature-request.yml
+name: 🚀 Feature request
+description: Suggest an idea for this project
+title: "[Feature] "
+
+body:
+- type: checkboxes
+  attributes:
+    label: Checklist
+    options:
+    - label: 1. If the issue you raised is not a feature but a question, please raise a discussion at https://github.com/sgl-project/sglang/discussions/new/choose. Otherwise, it will be closed.
+    - label: 2. Please use English, otherwise it will be closed.
+- type: textarea
+  attributes:
+    label: Motivation
+    description: |
+      A clear and concise description of the motivation of the feature.
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Related resources
+    description: |
+      If there is an official code release or third-party implementations, please also provide the information here, which would be very helpful.
--- a/.github/REVIEWERS.md
+++ b/.github/REVIEWERS.md
+# Area Reviewer
+
+Here are some reviewers for common areas. You can ping them to review your code if you touch related parts.
+
+## Hardware platforms
+- general @Alcanderian
+- AMD GPU @HaiShaw
+- Blackwell GPU @kushanam @trevor-m @zhyncs
+- CPU @mingfeima
+
+## Kernel
+- general @zhyncs @ispobock @HandH1998 @BBuf @yizhang2077 @HaiShaw
+- triton attention backend @ispobock
+- aiter attention backend @HaiShaw @kkHuang-amd @valarLip
+- flash attention backend @hebiao064
+- flashinfer attention backend @Fridge003
+- moe kernel @BBuf @fzyzcjy @ch-wan @Alcanderian
+
+## Scheduler and memory pool
+- general @merrymercy @Ying1123 @hnyls2002 @xiezhq-hermann
+- constrained decoding @hnyls2002
+- hierarchical cache @xiezhq-hermann @DarkSharpness
+- lora @Fridge003 @Ying1123 @lifuhuang
+- speculative decoding @merrymercy @Ying1123 @kssteven418 @Qiaolin-Yu
+- sliding window attention @hanming-lu
+
+## Parallelism
+- expert parallelism @fzyzcjy @ch-wan
+- data parallelism attention @ch-wan
+- pipeline parallelism @Ying1123
+- tensor parallelism @merrymercy
+
+## PD disaggregation
+- general @ByronHsu @ShangmingCai @hnyls2002
+- Mooncake backend @ShangmingCai
+
+## Build and release
+- general @zhyncs @merrymercy
+
+## API Server
+- general @CatherineSue @slin1237 @ispobock
+- function calling and reasoning parsing @CatherineSue
+- OpenAI API @CatherineSue @slin1237
+
+## SGL-Router
+- general @slin1237 @ByronHsu
+
+## Model
+- multimodal models @mickqian @JustinTong0323
+- other new models @zhaochenyang20
+
+## Reinforcement learning
+- general @zhaochenyang20 @hebiao064 @fzyzcjy @zhuzilin
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
+<!-- Thank you for your contribution! Please follow these guidelines to enhance your pull request. If anything is unclear, submit your PR and reach out to maintainers for assistance. Join our Slack community at https://slack.sglang.ai to discuss further. -->
+
+## Motivation
+
+<!-- Describe the purpose and goals of this pull request. -->
+
+## Modifications
+
+<!-- Detail the changes made in this pull request. -->
+
+## Accuracy Tests
+
+<!-- If this pull request affects model outputs (e.g., changes to the kernel or model forward code), provide accuracy test results. -->
+
+## Benchmarking and Profiling
+
+<!-- If this pull request impacts inference speed, provide benchmarking and profiling results. -->
+
+## Checklist
+
+- [ ] Format your code according to the [Format code with pre-commit](https://docs.sglang.ai/developer_guide/contribution_guide.html#format-code-with-pre-commit).
+- [ ] Add unit tests according to the [Run and add unit tests](https://docs.sglang.ai/developer_guide/contribution_guide.html#run-and-add-unit-tests).
+- [ ] Update documentation according to [Write documentations](https://docs.sglang.ai/developer_guide/contribution_guide.html#write-documentations).
+- [ ] Provide accuracy and speed benchmark results according to [Test the accuracy](https://docs.sglang.ai/developer_guide/contribution_guide.html#test-the-accuracy) and [Benchmark the speed](https://docs.sglang.ai/developer_guide/contribution_guide.html#benchmark-the-speed).
--- a/.github/workflows/cancel-all-pending-pr-test-runs.yml
+++ b/.github/workflows/cancel-all-pending-pr-test-runs.yml
+name: Cancel All Pending PR Test Runs
+
+# Manually triggered helper: cancels queued / in-progress runs of the
+# workflow files listed in the `workflows` input.
+on:
+  workflow_dispatch:
+    inputs:
+      workflows:
+        description: 'Space-separated list of workflow filenames to cancel'
+        required: true
+        type: string
+        default: 'pr-test.yml pr-test-xeon.yml'
+
+permissions:
+  actions: write   # Needed to cancel runs
+  contents: read   # Needed to read repo info
+
+jobs:
+  cancel-pending:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Install GitHub CLI
+        run: sudo apt-get install -y gh jq
+
+      - name: Cancel all pending/waiting runs for specified workflows
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          REPO: ${{ github.repository }}
+          # The user-supplied input is passed through the environment rather
+          # than interpolated into the script text, so its content is treated
+          # as data and cannot inject shell commands.
+          WORKFLOWS: ${{ github.event.inputs.workflows }}
+        run: |
+          # Split the space-separated input into a bash array.
+          # `read -a` splits on whitespace without performing glob expansion.
+          read -r -a WORKFLOW_FILES <<< "$WORKFLOWS"
+
+          echo "Targeting ${#WORKFLOW_FILES[@]} workflow(s): $WORKFLOWS"
+
+          for workflow_file in "${WORKFLOW_FILES[@]}"; do
+            echo "--- Checking workflow: $workflow_file ---"
+            # List recent runs as JSON, keep the ids of runs that are queued
+            # or in progress, and cancel them one by one.
+            gh run list \
+              --repo "$REPO" \
+              --workflow "$workflow_file" \
+              --json databaseId,status \
+              --limit 1000 \
+              | jq -r '.[] | select(.status=="queued" or .status=="in_progress") | .databaseId' \
+              | while read -r run_id; do
+                  echo "Cancelling run ID: $run_id for workflow: $workflow_file"
+                  gh run cancel "$run_id" --repo "$REPO"
+                done
+          done
--- a/.github/workflows/cancel-pr-workflow-on-merge.yml
+++ b/.github/workflows/cancel-pr-workflow-on-merge.yml
+name: Cancel PR Workflows on Merge
+
+# Fires whenever a pull request is closed; the job below additionally
+# requires that the pull request was merged.
+on:
+  pull_request_target:
+    types:
+      - closed
+
+permissions:
+  actions: write
+
+jobs:
+  cancel:
+    # Do nothing for pull requests that were closed without merging.
+    if: github.event.pull_request.merged == true
+    runs-on: ubuntu-latest
+    steps:
+      - name: Cancel Previous Runs
+        uses: styfle/cancel-workflow-action@0.12.1
+        with:
+          workflow_id: all
+          access_token: ${{ secrets.GITHUB_TOKEN }}
+          ignore_sha: true
+          pr_number: ${{ github.event.pull_request.number }}
--- a/.github/workflows/close-inactive-issues.yml
+++ b/.github/workflows/close-inactive-issues.yml
+name: Close Inactive Issues
+
+# Runs daily (and on manual dispatch) to close open issues that have not been
+# updated for 60 days.
+on:
+  schedule:
+    - cron: '0 0 * * *'
+  workflow_dispatch:
+
+permissions:
+  issues: write
+  contents: read
+
+jobs:
+  close-inactive-issues:
+    # Only run in the upstream repository, never in forks.
+    if: github.repository == 'sgl-project/sglang'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check and close inactive issues
+        uses: actions/github-script@v6
+        with:
+          github-token: ${{secrets.GITHUB_TOKEN}}
+          script: |
+            // Cut-off timestamp: anything last updated before this is inactive.
+            const sixtyDaysAgo = new Date(Date.now() - 60 * 24 * 60 * 60 * 1000);
+
+            const [owner, repo] = process.env.GITHUB_REPOSITORY.split('/');
+            console.log(`Owner: ${owner}, Repo: ${repo}`);
+
+            // Fetch one page of open issues, least recently updated first.
+            // NOTE(review): the REST "list repository issues" endpoint also
+            // returns pull requests (marked with a `pull_request` key), so
+            // stale PRs are handled like issues here - confirm this is intended.
+            async function fetchIssues(page = 1) {
+              console.log(`Fetching issues for ${owner}/${repo}, page ${page}`);
+              return await github.rest.issues.listForRepo({
+                owner,
+                repo,
+                state: 'open',
+                sort: 'updated',
+                direction: 'asc',
+                per_page: 100,
+                page: page
+              });
+            }
+
+            // Phase 1: walk the pages and collect every inactive issue WITHOUT
+            // modifying anything. Closing issues while paginating removes them
+            // from the `state: open` listing, which shifts all later pages and
+            // makes the next page request skip issues.
+            // Stops at the first issue that is still active (the list is sorted
+            // by update time), on an empty page, or on a fetch error.
+            async function collectInactiveIssues() {
+              const inactive = [];
+              let page = 1;
+              while (true) {
+                let issues;
+                try {
+                  issues = await fetchIssues(page);
+                } catch (error) {
+                  console.error(`Error fetching issues on page ${page}: ${error.message}`);
+                  return inactive;
+                }
+                console.log(`Fetched ${issues.data.length} issues on page ${page}`);
+
+                if (issues.data.length === 0) {
+                  return inactive;
+                }
+
+                for (const issue of issues.data) {
+                  // Skip if the issue has 'good first issue' label
+                  if (issue.labels.some(label => label.name === 'good first issue')) {
+                    console.log(`Skipping issue #${issue.number} as it's marked as 'good first issue'`);
+                    continue;
+                  }
+                  if (new Date(issue.updated_at) < sixtyDaysAgo) {
+                    inactive.push(issue);
+                  } else {
+                    console.log(`Issue #${issue.number} is still active. Stopping processing.`);
+                    return inactive;
+                  }
+                }
+                page += 1;
+              }
+            }
+
+            // Phase 2 helper: close one issue, add the 'inactive' label (keeping
+            // the existing labels) and leave an explanatory comment. A failure
+            // is logged and does not stop the remaining issues.
+            async function closeIssue(issue) {
+              try {
+                await github.rest.issues.update({
+                  owner,
+                  repo,
+                  issue_number: issue.number,
+                  state: 'closed',
+                  labels: [...issue.labels.map(l => l.name), 'inactive']
+                });
+                await github.rest.issues.createComment({
+                  owner,
+                  repo,
+                  issue_number: issue.number,
+                  body: 'This issue has been automatically closed due to inactivity. Please feel free to reopen it if needed.'
+                });
+                console.log(`Closed issue #${issue.number} due to inactivity.`);
+              } catch (error) {
+                console.error(`Failed to close issue #${issue.number}: ${error.message}`);
+              }
+            }
+
+            async function processIssues() {
+              console.log('Starting to process issues');
+              console.log(`Repository: ${owner}/${repo}`);
+
+              const inactiveIssues = await collectInactiveIssues();
+              for (const issue of inactiveIssues) {
+                await closeIssue(issue);
+              }
+              console.log('Finished processing issues');
+            }
+
+            await processIssues();
--- a/.github/workflows/execute-notebook.yml
+++ b/.github/workflows/execute-notebook.yml
+name: Execute Notebooks
+
+# Executes the documentation notebooks for pull requests against main that
+# touch the Python package or the docs, and on manual dispatch.
+on:
+  pull_request:
+    branches: [ main ]
+    paths:
+      - "python/sglang/**"
+      - "docs/**"
+  workflow_dispatch:
+
+
+# A newer run for the same ref cancels the one still in progress.
+concurrency:
+  group: execute-notebook-${{ github.ref }}
+  cancel-in-progress: true
+
+
+jobs:
+  run-all-notebooks:
+    runs-on: 1-gpu-runner
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_dependency.sh
+          pip install -r docs/requirements.txt
+          apt-get update && apt-get install -y pandoc parallel retry
+          ln -sf "$(which python3)" /usr/bin/python
+
+      - name: Setup Jupyter Kernel
+        run: |
+          python -m ipykernel install --user --name python3 --display-name "Python 3"
+
+      - name: Execute notebooks
+        timeout-minutes: 40
+        run: |
+          cd docs
+          make clean
+          make compile
+
+
+  # Aggregation job: turns the results of the jobs it depends on into a single
+  # pass/fail status.
+  notebook-finish:
+    # Without `always()` this job is skipped as soon as a needed job fails or
+    # is cancelled, so the failure check below could never run.
+    if: always()
+    needs: [
+      run-all-notebooks
+    ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check all dependent job statuses
+        run: |
+          # One result string per needed job (success/failure/cancelled/skipped)
+          results=(${{ join(needs.*.result, ' ') }})
+          for result in "${results[@]}"; do
+            if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
+              echo "Job failed with result: $result"
+              exit 1
+            fi
+          done
+          echo "All jobs completed successfully"
+          exit 0
--- a/.github/workflows/experiment-runner.yml
+++ b/.github/workflows/experiment-runner.yml
+name: Experiment Runner
+
+# Manually triggered workflow that runs test/srt/experiment_runner.py with
+# the config given in the `script` input.
+on:
+  workflow_dispatch:
+    inputs:
+      script:
+        description: "Experiment Runner Script"
+        default: "configs/sharegpt_config.yaml"
+
+# A newer run for the same ref cancels the one still in progress.
+concurrency:
+  group: experiment-runner-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  experiment-runner-1-gpu:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: 1-gpu-runner
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_dependency.sh
+
+      - name: Test experiment runner
+        timeout-minutes: 120
+        env:
+          # The user-supplied input is passed through the environment rather
+          # than interpolated into the script text, so it cannot inject shell
+          # commands; it is quoted below so it stays a single argument.
+          CONFIG_PATH: ${{ inputs.script }}
+        run: |
+          cd test/srt
+          python3 experiment_runner.py --config "$CONFIG_PATH"
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
+name: Lint
+
+# Runs the pre-commit hooks against the whole repository on every pull request.
+on: [pull_request]
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          # Quoted so YAML keeps it a string instead of parsing it as the number 3.1
+          python-version: "3.10"
+
+      - name: Install pre-commit hook
+        run: |
+          python -m pip install pre-commit
+          pre-commit install
+
+      - name: Linting
+        # Check every file, not only changed ones, and print the diff when a hook fails
+        run: pre-commit run --all-files --show-diff-on-failure
--- a/.github/workflows/nightly-test-amd.yml
+++ b/.github/workflows/nightly-test-amd.yml
+name: Nightly Test (AMD)
+
+# Runs daily at 00:00 UTC, when python/sglang/version.py changes on main,
+# and on manual dispatch.
+on:
+  schedule:
+    - cron: '0 0 * * *'
+  push:
+    branches:
+      - main
+    paths:
+      - "python/sglang/version.py"
+  workflow_dispatch:
+
+# A newer run for the same ref cancels the one still in progress.
+concurrency:
+  group: nightly-test-amd-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  nightly-test:
+    # NOTE(review): this workflow has no pull_request trigger, so the second
+    # clause cannot match here; effectively this restricts runs to the
+    # upstream repository.
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    strategy:
+      matrix:
+        # The job is run once per runner label listed here.
+        runner: [linux-mi300-gpu-2, linux-mi325-gpu-2-nightly]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd_ci_install_dependency.sh
+
+      - name: Nightly Test
+        run: |
+          # GITHUB_STEP_SUMMARY is pointed at github_summary.md for the test
+          # command; its content is appended to the real step summary afterwards.
+          bash scripts/ci/amd_ci_exec.sh -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd --timeout-per-file 7200
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY
--- a/.github/workflows/nightly-test.yml
+++ b/.github/workflows/nightly-test.yml
+name: Nightly Test
+
+# Runs daily at 00:00 UTC, when python/sglang/version.py changes on main,
+# and on manual dispatch.
+on:
+  schedule:
+    - cron: '0 0 * * *'
+  push:
+    branches:
+      - main
+    paths:
+      - "python/sglang/version.py"
+  workflow_dispatch:
+
+# A newer run for the same ref cancels the one still in progress.
+concurrency:
+  group: nightly-test-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  nightly-test:
+    # NOTE(review): this workflow has no pull_request trigger, so the second
+    # clause cannot match here; effectively this restricts runs to the
+    # upstream repository.
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: 2-gpu-runner
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_dependency.sh
+
+      - name: Run test
+        # Step-level limit in minutes; --timeout-per-file below is passed to run_suite.py
+        timeout-minutes: 120
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite nightly --timeout-per-file 3600
--- a/.github/workflows/open-pr-copy-from-oss.yml
+++ b/.github/workflows/open-pr-copy-from-oss.yml
+name: Open A PR to Copy Code From OSS
+
+# Manual trigger only; the scheduled trigger below is currently disabled.
+on:
+  workflow_dispatch:
+  # schedule:
+  #   - cron: '0 10 * * *'
+
+permissions:
+  contents: write
+
+jobs:
+  copy:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          # Always operate on main, regardless of the ref the workflow was dispatched from
+          ref: 'main'
+
+      - name: Install GitHub CLI (if not present)
+        run: |
+          bash scripts/code_sync/install_github_cli.sh
+
+      - name: Copy from OSS code
+        env:
+          # Personal access token exposed to the sync script as GH_TOKEN
+          GH_TOKEN: ${{ secrets.PAT_FOR_CODE_SYNC_FROM_LIANMIN }}
+        run: |
+          python3 scripts/code_sync/copy_from_oss.py
--- a/.github/workflows/open-pr-copy-to-oss.yml
+++ b/.github/workflows/open-pr-copy-to-oss.yml
+name: Open A PR to Copy Diff To OSS
+
+# Manually triggered workflow that runs scripts/code_sync/copy_to_oss.py for
+# the commit given in the `commit_sha` input.
+on:
+  workflow_dispatch:
+    inputs:
+      commit_sha:
+        description: 'The commit SHA to copy. Defaults to LAST to copy the latest commit.'
+        required: false
+        default: 'LAST'
+
+permissions:
+  contents: write
+
+jobs:
+  copy:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          # 0 = fetch the complete history instead of a shallow clone
+          fetch-depth: 0
+
+      - name: Install GitHub CLI (if not present)
+        run: |
+          bash scripts/code_sync/install_github_cli.sh
+
+      - name: Copy to OSS code
+        env:
+          GH_TOKEN: ${{ secrets.PAT_FOR_CODE_SYNC_FROM_LIANMIN }}
+          # The user-supplied input is passed through the environment rather
+          # than interpolated into the script text, so it cannot inject shell
+          # commands; it is quoted below so it stays a single argument.
+          COMMIT_SHA: ${{ github.event.inputs.commit_sha }}
+        run: |
+          python3 scripts/code_sync/copy_to_oss.py --commit "$COMMIT_SHA"
--- a/.github/workflows/pr-benchmark-rust.yml
+++ b/.github/workflows/pr-benchmark-rust.yml
+name: PR Benchmark (Rust Router)
+
+# Triggered by pushes to main and pull requests against main that touch
+# sgl-router/, and by manual dispatch.
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - "sgl-router/**"
+  pull_request:
+    branches: [ main ]
+    paths:
+      - "sgl-router/**"
+    # `labeled` is included so that adding a label to a PR triggers a run
+    types: [opened, synchronize, reopened, labeled]
+  workflow_dispatch:
+
+# A newer run for the same ref cancels the one still in progress.
+concurrency:
+  group: pr-benchmark-rust-${{ github.ref }}
+  cancel-in-progress: true
+permissions:
+  contents: read
+  pull-requests: write
+  issues: write
+jobs:
+  # Quick check job that always runs on PRs
+  # It only verifies that the benchmark targets compile (`cargo check --benches`).
+  benchmark-compile-check:
+    name: Benchmark Compilation Check
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_rust.sh
+
+      - name: Setup sccache
+        uses: mozilla-actions/sccache-action@v0.0.3
+        # A failure to set up sccache must not fail the job
+        continue-on-error: true
+
+      - name: Rust cache
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: sgl-router
+          # Share cache across all benchmark jobs
+          shared-key: "rust-cache"
+          # Save cache even on failure
+          save-if: true
+
+      - name: Check benchmarks compile
+        run: |
+          source "$HOME/.cargo/env"
+          cd sgl-router/
+          # Try to use sccache, but disable if it fails
+          if command -v sccache &> /dev/null; then
+            echo "Testing sccache availability..."
+            # Try to start sccache and check if it works
+            export RUSTC_WRAPPER=sccache
+            export SCCACHE_GHA_ENABLED="true"
+            if sccache --start-server 2>/dev/null && sccache --show-stats 2>/dev/null; then
+              echo "sccache is working, using it for compilation"
+            else
+              echo "sccache failed to start, falling back to regular cargo"
+              # Drop the wrapper again so cargo invokes rustc directly
+              unset RUSTC_WRAPPER
+              unset SCCACHE_GHA_ENABLED
+            fi
+          else
+            echo "sccache not available, using regular cargo"
+          fi
+          cargo check --benches
+
+  # Full benchmark jobs that only run with label or on main branch
+  benchmark-request-processing:
+    name: Request Processing Benchmark
+    # Upstream repository only, and only for pushes, manual dispatches, or
+    # pull requests carrying the 'benchmark' label.
+    if: |
+      github.repository == 'sgl-project/sglang' &&
+      (github.event_name == 'push' ||
+       github.event_name == 'workflow_dispatch' ||
+       contains(github.event.pull_request.labels.*.name, 'benchmark'))
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          # Fetch enough history for baseline comparison
+          fetch-depth: 100
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_rust.sh
+
+      - name: Setup sccache
+        uses: mozilla-actions/sccache-action@v0.0.3
+        # A failure to set up sccache must not fail the job
+        continue-on-error: true
+
+      - name: Rust cache
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: sgl-router
+          # Share cache across all benchmark jobs
+          shared-key: "rust-cache"
+          # Save cache even on failure
+          save-if: true
+
+      - name: Run request processing benchmark
+        timeout-minutes: 30
+        run: |
+          source "$HOME/.cargo/env"
+          cd sgl-router/
+          # Try to use sccache, but disable if it fails
+          if command -v sccache &> /dev/null; then
+            echo "Testing sccache availability..."
+            # Try to start sccache and check if it works
+            export RUSTC_WRAPPER=sccache
+            export SCCACHE_GHA_ENABLED="true"
+            if sccache --start-server 2>/dev/null && sccache --show-stats 2>/dev/null; then
+              echo "sccache is working, using it for compilation"
+            else
+              echo "sccache failed to start, falling back to regular cargo"
+              # Drop the wrapper again so cargo invokes rustc directly
+              unset RUSTC_WRAPPER
+              unset SCCACHE_GHA_ENABLED
+            fi
+          else
+            echo "sccache not available, using regular cargo"
+          fi
+          # Run only the summary benchmark for quick validation in PRs
+          cargo bench --bench request_processing -- benchmark_summary --exact
+
+      - name: Upload benchmark results
+        # Upload whatever results exist even when the benchmark step failed
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: request-processing-results-${{ github.sha }}
+          path: |
+            sgl-router/target/criterion/benchmark_summary/
+          retention-days: 30
+
+  # Runs the tokenizer benchmark and uploads its criterion output.
+  benchmark-tokenizer:
+    name: Tokenizer Benchmark
+    # Upstream repository only, and only for pushes, manual dispatches, or
+    # pull requests carrying the 'benchmark' label.
+    if: |
+      github.repository == 'sgl-project/sglang' &&
+      (github.event_name == 'push' ||
+       github.event_name == 'workflow_dispatch' ||
+       contains(github.event.pull_request.labels.*.name, 'benchmark'))
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 100
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_rust.sh
+
+      - name: Setup sccache
+        uses: mozilla-actions/sccache-action@v0.0.3
+        # A failure to set up sccache must not fail the job
+        continue-on-error: true
+
+      - name: Rust cache
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: sgl-router
+          # Share cache across all benchmark jobs
+          shared-key: "rust-cache"
+          # Save cache even on failure
+          save-if: true
+
+      - name: Run tokenizer benchmark
+        timeout-minutes: 30
+        run: |
+          source "$HOME/.cargo/env"
+          cd sgl-router/
+          # Try to use sccache, but disable if it fails
+          if command -v sccache &> /dev/null; then
+            echo "Testing sccache availability..."
+            # Try to start sccache and check if it works
+            export RUSTC_WRAPPER=sccache
+            export SCCACHE_GHA_ENABLED="true"
+            if sccache --start-server 2>/dev/null && sccache --show-stats 2>/dev/null; then
+              echo "sccache is working, using it for compilation"
+            else
+              echo "sccache failed to start, falling back to regular cargo"
+              # Drop the wrapper again so cargo invokes rustc directly
+              unset RUSTC_WRAPPER
+              unset SCCACHE_GHA_ENABLED
+            fi
+          else
+            echo "sccache not available, using regular cargo"
+          fi
+          cargo bench --bench tokenizer_benchmark
+
+      - name: Upload benchmark results
+        # Upload whatever results exist even when the benchmark step failed
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: tokenizer-results-${{ github.sha }}
+          path: |
+            sgl-router/target/criterion/tokenizer*/
+          retention-days: 30
+
+  # Runs the tool parser benchmark and uploads its criterion output.
+  benchmark-tool-parser:
+    name: Tool Parser Benchmark
+    # Upstream repository only, and only for pushes, manual dispatches, or
+    # pull requests carrying the 'benchmark' label.
+    if: |
+      github.repository == 'sgl-project/sglang' &&
+      (github.event_name == 'push' ||
+       github.event_name == 'workflow_dispatch' ||
+       contains(github.event.pull_request.labels.*.name, 'benchmark'))
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 100
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_rust.sh
+
+      - name: Setup sccache
+        uses: mozilla-actions/sccache-action@v0.0.3
+        # A failure to set up sccache must not fail the job
+        continue-on-error: true
+
+      - name: Rust cache
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: sgl-router
+          # Share cache across all benchmark jobs
+          shared-key: "rust-cache"
+          # Save cache even on failure
+          save-if: true
+
+      - name: Run tool parser benchmark
+        timeout-minutes: 30
+        run: |
+          source "$HOME/.cargo/env"
+          cd sgl-router/
+          # Try to use sccache, but disable if it fails
+          if command -v sccache &> /dev/null; then
+            echo "Testing sccache availability..."
+            # Try to start sccache and check if it works
+            export RUSTC_WRAPPER=sccache
+            export SCCACHE_GHA_ENABLED="true"
+            if sccache --start-server 2>/dev/null && sccache --show-stats 2>/dev/null; then
+              echo "sccache is working, using it for compilation"
+            else
+              echo "sccache failed to start, falling back to regular cargo"
+              # Drop the wrapper again so cargo invokes rustc directly
+              unset RUSTC_WRAPPER
+              unset SCCACHE_GHA_ENABLED
+            fi
+          else
+            echo "sccache not available, using regular cargo"
+          fi
+          cargo bench --bench tool_parser_benchmark
+
+      - name: Upload benchmark results
+        # Upload whatever results exist even when the benchmark step failed
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: tool-parser-results-${{ github.sha }}
+          path: |
+            sgl-router/target/criterion/tool_parser*/
+          retention-days: 30
+
+  benchmark-summary:
+    name: Benchmark Summary
+    needs: [benchmark-request-processing, benchmark-tokenizer, benchmark-tool-parser]
+    if: always() && (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request')
+    runs-on: ubuntu-latest
+    steps:
+      - name: Download all benchmark results
+        uses: actions/download-artifact@v4
+        with:
+          pattern: '*-results-${{ github.sha }}'
+          path: benchmark-results
+
+      - name: Generate summary
+        run: |
+          echo "## Benchmark Results Summary" > summary.md
+          echo "" >> summary.md
+          echo "### Request Processing" >> summary.md
+          if [ -d "benchmark-results/request-processing-results-${{ github.sha }}" ]; then
+            echo "✅ Completed" >> summary.md
+          else
+            echo "❌ Failed or skipped" >> summary.md
+          fi
+          echo "" >> summary.md
+          echo "### Tokenizer" >> summary.md
+          if [ -d "benchmark-results/tokenizer-results-${{ github.sha }}" ]; then
+            echo "✅ Completed" >> summary.md
+          else
+            echo "❌ Failed or skipped" >> summary.md
+          fi
+          echo "" >> summary.md
+          echo "### Tool Parser" >> summary.md
+          if [ -d "benchmark-results/tool-parser-results-${{ github.sha }}" ]; then
+            echo "✅ Completed" >> summary.md
+          else
+            echo "❌ Failed or skipped" >> summary.md
+          fi
+          cat summary.md
+
+      - name: Upload summary
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-summary-${{ github.sha }}
+          path: summary.md
+          retention-days: 30