- To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
- To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
- To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`. (A combined launch example for these quantization flags is sketched after the multi-node commands below.)
- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](https://sgl-project.github.io/custom_chat_template.html).
- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs each and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` an available port; then you can use the following commands. If you encounter a deadlock, try adding `--disable-cuda-graph`.
```
# The model path is illustrative; older SGLang releases use --nccl-init-addr instead of --dist-init-addr.
# Node 0
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --dist-init-addr sgl-dev-0:50000 --nnodes 2 --node-rank 0

# Node 1
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --dist-init-addr sgl-dev-0:50000 --nnodes 2 --node-rank 1
```
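As referenced in the quantization bullets above, here is a sketch of how those flags attach to an ordinary launch command. The model path is illustrative, and whether a given flag combination is supported depends on your GPU and SGLang version:

```
# fp8 weight quantization combined with an fp8 kv cache (model path is illustrative)
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct \
    --quantization fp8 --kv-cache-dtype fp8_e5m2

# Alternatively, int4 weight-only quantization via torchao (group size 128)
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct \
    --torchao-config int4wo-128
```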
Add unit tests under [sglang/test](https://github.com/sgl-project/sglang/tree/main/test). You can learn how to add and run tests from the README.md in that folder.
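For instance, a typical workflow might look like this; the file paths here are illustrative, and the README.md in that folder has the authoritative commands:

```
# Run a single test file (illustrative path)
python3 test/lang/test_srt_backend.py

# Or run a whole suite with unittest discovery
python3 -m unittest discover -s test/lang
```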
**NOTE**: There are two chat template systems in the SGLang project. This document is about setting a custom chat template for the OpenAI-compatible API server (defined at [conversation.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/conversation.py)). It is NOT related to the chat template used in the SGLang language frontend (defined at [chat_template.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/lang/chat_template.py)).
By default, the server uses the chat template specified in the model tokenizer from Hugging Face.
It should just work for most official models such as Llama-2/Llama-3.
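If it does not, a custom template can be supplied as a JSON file and passed via `--chat-template`. The sketch below assumes the ChatML-style registration format from `conversation.py`; the field values are illustrative and must match your model's actual special tokens:

```
# Write a custom template to a JSON file (field values are illustrative)
cat > my_model_template.json <<'EOF'
{
  "name": "my_model",
  "system": "<|im_start|>system",
  "user": "<|im_start|>user",
  "assistant": "<|im_start|>assistant",
  "sep_style": "CHATML",
  "sep": "<|im_end|>",
  "stop_str": ["<|im_end|>", "<|im_start|>"]
}
EOF

# Launch the server with the custom template
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct \
    --chat-template ./my_model_template.json
```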