Unverified commit 6aa94b96, authored by Lianmin Zheng, committed by GitHub

Update ci workflows (#1804)

parent c2650748
name: Build Documentation
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
workflow_dispatch:
jobs:
execute-notebooks:
runs-on: 1-gpu-runner
-if: github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch'
+if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
steps:
-- uses: actions/checkout@v3
-with:
-fetch-depth: 0
+- name: Checkout code
+uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
@@ -23,22 +18,14 @@ jobs:
- name: Install dependencies
run: |
-pip install --upgrade pip
-pip install -e "python[all]"
+bash scripts/ci_install_dependency.sh
pip install -r docs/requirements.txt
pip install nbconvert jupyter_client ipykernel ipywidgets matplotlib
-pip install transformers==4.45.2
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
- name: Setup Jupyter Kernel
run: |
python -m ipykernel install --user --name python3 --display-name "Python 3"
- name: Execute notebooks
env:
HF_HOME: /hf_home
SGLANG_IS_IN_CI: true
CUDA_VISIBLE_DEVICES: 0
run: |
cd docs/en
for nb in *.ipynb; do
@@ -54,34 +41,18 @@ jobs:
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
runs-on: 1-gpu-runner
steps:
-- uses: actions/checkout@v3
-with:
-fetch-depth: 0
+- name: Checkout code
+uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.9'
- name: Cache Python dependencies
uses: actions/cache@v3
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
restore-keys: |
${{ runner.os }}-pip-
- name: Install dependencies
run: |
-pip install --upgrade pip
-pip install -e "python[all]"
+bash scripts/ci_install_dependency.sh
pip install -r docs/requirements.txt
pip install nbconvert jupyter_client ipykernel ipywidgets matplotlib
-pip install transformers==4.45.2
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
- name: Install Pandoc
run: |
apt-get update
apt-get install -y pandoc
......
@@ -8,10 +8,10 @@ jobs:
steps:
- uses: actions/checkout@v2
-- name: Set up Python 3.9
-uses: actions/setup-python@v2
+- name: Set up Python
+uses: actions/setup-python@v4
with:
-python-version: 3.9
+python-version: '3.9'
- name: Install pre-commit hook
run: |
......
@@ -24,9 +24,7 @@ jobs:
- name: Install dependencies
run: |
-pip install --upgrade pip
-pip install -e "python[all]"
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+bash scripts/ci_install_dependency.sh
- name: Nightly gsm8k Accuracy
timeout-minutes: 60
......
@@ -27,10 +27,7 @@ jobs:
- name: Install dependencies
run: |
-pip install --upgrade pip
-pip install -e "python[dev]"
-pip install transformers==4.45.2
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+bash scripts/ci_install_dependency.sh
- name: Run test
timeout-minutes: 10
@@ -47,10 +44,7 @@ jobs:
- name: Install dependencies
run: |
-pip install --upgrade pip
-pip install -e "python[dev]"
-pip install transformers==4.45.2
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+bash scripts/ci_install_dependency.sh
- name: Run test
timeout-minutes: 20
@@ -67,10 +61,7 @@ jobs:
- name: Install dependencies
run: |
-pip install --upgrade pip
-pip install -e "python[dev]"
-pip install transformers==4.45.2
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+bash scripts/ci_install_dependency.sh
- name: Run test
timeout-minutes: 20
@@ -87,10 +78,7 @@ jobs:
- name: Install dependencies
run: |
-pip install --upgrade pip
-pip install -e "python[dev]"
-pip install transformers==4.45.2
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+bash scripts/ci_install_dependency.sh
- name: Run test
timeout-minutes: 20
@@ -107,10 +95,7 @@ jobs:
- name: Install dependencies
run: |
-pip install --upgrade pip
-pip install -e "python[dev]"
-pip install transformers==4.45.2
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+bash scripts/ci_install_dependency.sh
- name: Run test
timeout-minutes: 20
@@ -127,10 +112,7 @@ jobs:
- name: Install dependencies
run: |
-pip install --upgrade pip
-pip install -e "python[all]"
-pip install transformers==4.45.2
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+bash scripts/ci_install_dependency.sh
- name: Benchmark Single Latency
timeout-minutes: 10
@@ -165,10 +147,7 @@ jobs:
- name: Install dependencies
run: |
-pip install --upgrade pip
-pip install -e "python[all]"
-pip install transformers==4.45.2
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+bash scripts/ci_install_dependency.sh
- name: Benchmark Offline Throughput (w/o RadixAttention)
timeout-minutes: 10
@@ -197,10 +176,7 @@ jobs:
- name: Install dependencies
run: |
-pip install --upgrade pip
-pip install -e "python[all]"
-pip install transformers==4.45.2
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+bash scripts/ci_install_dependency.sh
- name: Benchmark Offline Throughput (TP=2)
timeout-minutes: 10
@@ -229,10 +205,7 @@ jobs:
- name: Install dependencies
run: |
-pip install --upgrade pip
-pip install -e "python[all]"
-pip install transformers==4.45.2
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+bash scripts/ci_install_dependency.sh
git clone https://github.com/merrymercy/human-eval.git
cd human-eval
@@ -253,10 +226,7 @@ jobs:
- name: Install dependencies
run: |
-pip install --upgrade pip
-pip install -e "python[all]"
-pip install transformers==4.45.2
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+bash scripts/ci_install_dependency.sh
git clone https://github.com/merrymercy/human-eval.git
cd human-eval
......
name: Release GitHub
on:
workflow_dispatch:
jobs:
publish:
if: github.repository == 'sgl-project/sglang'
runs-on: ubuntu-latest
environment: 'prod'
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Get version
id: get_version
run: |
version=$(cat python/sglang/version.py | cut -d'"' -f2)
echo "TAG=v$version" >> $GITHUB_OUTPUT
- name: Release
uses: softprops/action-gh-release@v1
env:
GITHUB_TOKEN: ${{ secrets.REPO_TOKEN }}
with:
name: Release ${{ steps.get_version.outputs.TAG }}
tag_name: ${{ steps.get_version.outputs.TAG }}
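For context on the `get_version` step above: `python/sglang/version.py` stores the version as a quoted string (the `conf.py` change later in this commit reads the same file), so `cut -d'"' -f2` extracts the text between the first pair of double quotes. A minimal sketch of the mechanism, with a made-up version number:

```bash
# Illustrative only: "0.3.4" is a placeholder version, not the real one.
echo '__version__ = "0.3.4"' | cut -d'"' -f2
# prints: 0.3.4  -> the workflow then names and tags the release v0.3.4
```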
@@ -13,12 +13,14 @@ jobs:
runs-on: ubuntu-latest
environment: 'prod'
steps:
-- name: Set up python3.8
+- name: Set up Python
uses: actions/setup-python@v4
with:
-python-version: '3.8'
+python-version: '3.9'
- name: Checkout repository
uses: actions/checkout@v3
- name: Upload to pypi
run: |
cd python
......
-SPHINXOPTS =
-SPHINXBUILD = sphinx-build
+# Minimal makefile for Sphinx documentation
+#
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS ?=
+SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
clean:
rm -rf $(BUILDDIR)/*
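Note that with the catch-all `%: Makefile` target above, `make html` simply runs `sphinx-build -M html . _build`, which is what the README below relies on.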
# SGLang Documentation
## Build the documentation website
### Dependency
```
pip install -r requirements.txt
```
### Build
```
make html
```
### Clean
To remove all generated files:
```
make clean
```
### Serve (preview)
Run an HTTP server and visit http://localhost:8000 in your browser.
```
python3 -m http.server --directory _build/html
```
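(The `--directory`/`-d` flag of `http.server` requires Python 3.7 or newer; on older interpreters, `cd _build/html && python3 -m http.server` achieves the same.)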
### Deploy
Clone [sgl-project.github.io](https://github.com/sgl-project/sgl-project.github.io) and make sure you have write access.
```bash
export DOC_SITE_PATH=../../sgl-project.github.io # update this with your path
python3 deploy.py
```
\ No newline at end of file
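The `deploy.py` script used here is added at the end of this commit; it pulls the site repository, copies `_build/html` into it, and commits and pushes the result.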
@@ -20,7 +20,7 @@ curl http://localhost:30000/generate \
}'
```
-Learn more about the argument specification, streaming, and multi-modal support [here](https://sglang.readthedocs.io/en/latest/sampling_params.html).
+Learn more about the argument specification, streaming, and multi-modal support [here](https://sgl-project.github.io/sampling_params.html).
### OpenAI Compatible API
In addition, the server supports OpenAI-compatible APIs.
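As a minimal sketch of such a call (the model path and port are taken from the surrounding examples; the exact snippet in the elided part of this README may differ):

```python
import openai

# Talk to the local SGLang server through its OpenAI-compatible endpoint.
# Model path and port come from the launch_server examples above.
client = openai.Client(base_url="http://localhost:30000/v1", api_key="None")

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    messages=[{"role": "user", "content": "List 3 countries and their capitals."}],
    temperature=0,
    max_tokens=64,
)
print(response.choices[0].message.content)
```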
@@ -74,7 +74,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
```
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --mem-fraction-static 0.7
```
-- See the [hyperparameter tuning](https://sglang.readthedocs.io/en/latest/hyperparameter_tuning.html) guide for better performance.
+- See the [hyperparameter tuning](https://sgl-project.github.io/hyperparameter_tuning.html) guide for better performance.
- If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
```
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
@@ -83,7 +83,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
- To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
- To enable fp8 weight quantization, add `--quantization fp8` on an fp16 checkpoint, or directly load an fp8 checkpoint without specifying any arguments.
- To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
-- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](https://sglang.readthedocs.io/en/latest/custom_chat_template.html).
+- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](https://sgl-project.github.io/custom_chat_template.html).
- To run tensor parallelism on multiple nodes, add `--nnodes 2`. For example, if you have two nodes with two GPUs each and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` an available port; then you can use the following commands. If you encounter a deadlock, try adding `--disable-cuda-graph`.
```
# Node 0
@@ -158,7 +158,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
- gte-Qwen2
- `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
-Instructions for supporting a new model are [here](https://sglang.readthedocs.io/en/latest/model_support.html).
+Instructions for supporting a new model are [here](https://sgl-project.github.io/model_support.html).
#### Use Models From ModelScope
<details>
......
@@ -3,7 +3,7 @@ import sys
sys.path.insert(0, os.path.abspath("../.."))
-version_file = "../../python/sglang/version.py"
+version_file = "../python/sglang/version.py"
with open(version_file, "r") as f:
    exec(compile(f.read(), version_file, "exec"))
__version__ = locals()["__version__"]
......
@@ -11,4 +11,4 @@ pre-commit run --all-files
```
## Add Unit Tests
-Add unit tests under [sglang/test](../../test). You can learn how to add and run tests from the README.md in that folder.
+Add unit tests under [sglang/test](https://github.com/sgl-project/sglang/tree/main/test). You can learn how to add and run tests from the README.md in that folder.
# Custom Chat Template in SGLang Runtime
-**NOTE**: There are two chat template systems in the SGLang project. This document is about setting a custom chat template for the OpenAI-compatible API server (defined at [conversation.py](../../python/sglang/srt/conversation.py)). It is NOT related to the chat template used in the SGLang language frontend (defined at [chat_template.py](../../python/sglang/lang/chat_template.py)).
+**NOTE**: There are two chat template systems in the SGLang project. This document is about setting a custom chat template for the OpenAI-compatible API server (defined at [conversation.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/conversation.py)). It is NOT related to the chat template used in the SGLang language frontend (defined at [chat_template.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/lang/chat_template.py)).
By default, the server uses the chat template specified in the model tokenizer from Hugging Face.
It should just work for most official models such as Llama-2/Llama-3.
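For a rough illustration only (the field names and values below are assumptions in ChatML style, not taken from this page; the authoritative format follows in the elided part of the document), a custom template file passed to the server via `--chat-template` might look like:

```json
{
  "name": "my_model",
  "system": "<|im_start|>system",
  "user": "<|im_start|>user",
  "assistant": "<|im_start|>assistant",
  "sep_style": "CHATML",
  "sep": "<|im_end|>",
  "stop_str": ["<|im_end|>"]
}
```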
......
#!/usr/bin/python3
import os
from datetime import datetime
def run_cmd(cmd):
    print(cmd)
    os.system(cmd)
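# Update the local clone of the docs site; DOC_SITE_PATH must be set in the
# environment by the caller (see docs/README.md).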
run_cmd("cd $DOC_SITE_PATH; git pull")
# (Optional) Remove old files
# run_cmd("rm -rf $ALPA_SITE_PATH/*")
run_cmd("cp -r _build/html/* $DOC_SITE_PATH")
cmd_message = f"Update {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
run_cmd(
    f"cd $DOC_SITE_PATH; git add .; git commit -m '{cmd_message}'; git push origin main"
)