Unverified Commit 706bd69c authored by Lianmin Zheng, committed by GitHub

Clean up server_args.py to have a dedicated function for model specific adjustments (#8983)

parent 23f2afb2
@@ -26,10 +26,7 @@ jobs:
         run: |
           bash scripts/ci_install_dependency.sh
           pip install -r docs/requirements.txt
-          apt-get update
-          apt-get install -y pandoc
-          apt-get update && apt-get install -y parallel retry
+          apt-get update && apt-get install -y pandoc parallel retry
           ln -sf "$(which python3)" /usr/bin/python
       - name: Setup Jupyter Kernel
...
-name: Test Disaggregation Mode
+name: PR Test (PD Router)
 on:
   push:
...
@@ -61,7 +61,7 @@ jobs:
           python3 run_suite.py --suite per-commit

   unit-test-backend-1-gpu:
-    needs: check-changes
+    needs: [check-changes, unit-test-frontend]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -85,7 +85,7 @@ jobs:
           python3 run_suite.py --suite per-commit --auto-partition-id ${{ matrix.part }} --auto-partition-size 10

   unit-test-backend-2-gpu:
-    needs: check-changes
+    needs: [check-changes, unit-test-frontend]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -105,7 +105,7 @@ jobs:
           python3 run_suite.py --suite per-commit-2-gpu

   unit-test-backend-4-gpu:
-    needs: [check-changes, unit-test-frontend, unit-test-backend-2-gpu]
+    needs: [check-changes, unit-test-backend-2-gpu]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -125,7 +125,7 @@ jobs:
           python3 run_suite.py --suite per-commit-4-gpu

   unit-test-backend-8-gpu:
-    needs: [check-changes, unit-test-frontend, unit-test-backend-2-gpu]
+    needs: [check-changes, unit-test-backend-2-gpu]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -245,7 +245,7 @@ jobs:
           python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_online_latency

   performance-test-2-gpu:
-    needs: check-changes
+    needs: [check-changes, unit-test-backend-2-gpu]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -282,13 +282,13 @@ jobs:
           cd test/srt
           python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache

-      - name: Benchmark offline decode throughput (PP=2)
+      - name: Benchmark offline PP decode throughput (PP=2)
         timeout-minutes: 10
         run: |
           cd test/srt
           python3 -m unittest test_bench_serving.TestBenchServing.test_pp_offline_throughput_default_decode

-      - name: Benchmark offline prefill throughput (PP=2)
+      - name: Benchmark offline PP prefill throughput (PP=2)
         timeout-minutes: 10
         run: |
           cd test/srt
@@ -318,7 +318,7 @@ jobs:
           python3 test_eval_accuracy_large.py

   accuracy-test-2-gpu:
-    needs: check-changes
+    needs: [check-changes, accuracy-test-1-gpu]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -341,7 +341,7 @@ jobs:
           python3 test_moe_eval_accuracy_large.py

   unit-test-deepep-4-gpu:
-    needs: check-changes
+    needs: [check-changes, unit-test-backend-2-gpu]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -361,7 +361,7 @@ jobs:
           python3 run_suite.py --suite per-commit-4-gpu-deepep

   unit-test-deepep-8-gpu:
-    needs: [check-changes, unit-test-deepep-4-gpu]
+    needs: [check-changes, unit-test-backend-2-gpu]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -380,30 +380,8 @@ jobs:
           cd test/srt
           python3 run_suite.py --suite per-commit-8-gpu-deepep

-  pr-test-finish:
-    needs: [
-      check-changes,
-      unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu, unit-test-backend-4-gpu,
-      unit-test-backend-8-gpu, performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu,
-      accuracy-test-1-gpu, accuracy-test-2-gpu, unit-test-deepep-4-gpu, unit-test-deepep-8-gpu,
-    ]
-    if: needs.check-changes.outputs.src == 'true'
-    runs-on: ubuntu-latest
-    steps:
-      - name: Check all dependent job statuses
-        run: |
-          results=(${{ join(needs.*.result, ' ') }})
-          for result in "${results[@]}"; do
-            if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
-              echo "Job failed with result: $result"
-              exit 1
-            fi
-          done
-          echo "All jobs completed successfully"
-          exit 0
-
   unit-test-backend-8-gpu-b200:
-    needs: [check-changes, unit-test-frontend, unit-test-backend-2-gpu]
+    needs: [check-changes, unit-test-backend-2-gpu]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -416,10 +394,36 @@ jobs:
      - name: Install dependencies
        run: |
-          MODE_BLACKWELL=1 bash scripts/ci_install_dependency.sh
+          IS_BLACKWELL=1 bash scripts/ci_install_dependency.sh
       - name: Run test
         timeout-minutes: 20
         run: |
           cd test/srt
           python3 run_suite.py --suite per-commit-8-gpu-b200 --auto-partition-id 0 --auto-partition-size 1
+
+  pr-test-finish:
+    needs: [
+      check-changes,
+      unit-test-frontend, unit-test-backend-1-gpu,
+      unit-test-backend-2-gpu, unit-test-backend-4-gpu, unit-test-backend-8-gpu,
+      performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu,
+      accuracy-test-1-gpu, accuracy-test-2-gpu,
+      unit-test-deepep-4-gpu, unit-test-deepep-8-gpu,
+      unit-test-backend-8-gpu-b200,
+    ]
+    if: needs.check-changes.outputs.src == 'true'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check all dependent job statuses
+        run: |
+          results=(${{ join(needs.*.result, ' ') }})
+          for result in "${results[@]}"; do
+            if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
+              echo "Job failed with result: $result"
+              exit 1
+            fi
+          done
+          echo "All jobs completed successfully"
+          exit 0
@@ -30,11 +30,12 @@ jobs:
       - name: Install dependencies
         run: |
           bash scripts/ci_install_dependency.sh
-          pip install "vllm==0.10.0"
+          pip install "vllm==0.9.0"
           pip install "bitsandbytes>=0.44.0"
-          pip3 install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126
+          pip install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126
+          pip install "openai==1.99.1"

-      - name: Run VLLM dependency tests
+      - name: Run vLLM dependency tests
         timeout-minutes: 60
         run: |
           cd test/srt
...
@@ -20,13 +20,13 @@
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

 ## News
+- [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
 - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
 - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
 - [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
 - [2024/12] v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
-- [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).

 <details>
 <summary>More</summary>
@@ -35,6 +35,7 @@
 - [2025/01] SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
 - [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
+- [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -46,10 +47,10 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor parallelism, pipeline parallelism, expert parallelism, structured outputs, chunked prefill, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor/pipeline/expert/data parallelism, structured outputs, chunked prefill, quantization (FP4/FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
-- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Qwen, DeepSeek, Kimi, GPT, Gemma, Mistral, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Active Community**: SGLang is open-source and backed by an active community with wide industry adoption.

 ## Getting Started
 - [Install SGLang](https://docs.sglang.ai/start/install.html)
...
@@ -189,8 +189,8 @@ Please consult the documentation below and [server_args.py](https://github.com/s
 | Arguments | Description | Defaults |
 |-----------|-------------|----------|
 | `--attention-backend` | Choose the kernels for attention layers. | None |
-| `decode_attention_backend` | (Experimental) This argument specifies the backend for decode attention computation. Note that this argument has priority over `attention_backend`. | None |
-| `prefill_attention_backend` | (Experimental) This argument specifies the backend for prefill attention computation. Note that this argument has priority over `attention_backend`. | None |
+| `--prefill-attention-backend` | (Experimental) This argument specifies the backend for prefill attention computation. Note that this argument has priority over `attention_backend`. | None |
+| `--decode-attention-backend` | (Experimental) This argument specifies the backend for decode attention computation. Note that this argument has priority over `attention_backend`. | None |
 | `--sampling-backend` | Choose the kernels for sampling layers. | None |
 | `--grammar-backend` | Choose the backend for grammar-guided decoding. | None |
 | `--mm-attention-backend` | Set multimodal attention backend. | None |
...
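For illustration only, here is a minimal launch sketch combining the flags documented in the table above. The `sglang.launch_server` entrypoint, the model path, and this particular backend combination are assumptions for the example, not part of this diff; the phase-specific flags override `--attention-backend` for prefill and decode respectively.

    # Hypothetical invocation; flag names follow the table above, model path is a placeholder.
    python3 -m sglang.launch_server \
      --model-path meta-llama/Llama-3.1-8B-Instruct \
      --attention-backend flashinfer \
      --prefill-attention-backend fa3 \
      --decode-attention-backend flashinfer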
@@ -5,10 +5,10 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor parallelism, pipeline parallelism, expert parallelism, structured outputs, chunked prefill, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor/pipeline/expert/data parallelism, structured outputs, chunked prefill, quantization (FP4/FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
-- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Qwen, DeepSeek, Kimi, GPT, Gemma, Mistral, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Active Community**: SGLang is open-source and backed by an active community with wide industry adoption.

 .. toctree::
    :maxdepth: 1
...
@@ -21,6 +21,7 @@ runtime_common = [
     "build",
     "compressed-tensors",
     "datasets",
+    "einops",
     "fastapi",
     "hf_transfer",
     "huggingface_hub",
@@ -29,6 +30,7 @@ runtime_common = [
     "modelscope",
     "msgspec",
     "ninja",
+    "openai==1.99.1",
     "openai-harmony==0.0.3",
     "orjson",
     "outlines==0.1.11",
@@ -48,6 +50,7 @@ runtime_common = [
     "torchao==0.9.0",
     "transformers==4.55.0",
     "timm==1.0.16",
+    "tiktoken",
     "uvicorn",
     "uvloop",
     "xgrammar==0.1.22",
@@ -60,7 +63,6 @@ srt = [
     "torchaudio==2.8.0",
     "torchvision",
     "cuda-python",
-    "einops",
     "flashinfer_python==0.2.10",
 ]
@@ -71,10 +73,7 @@ blackwell = [
     "torchaudio==2.8.0",
     "torchvision",
     "cuda-python",
-    "einops",
     "flashinfer_python==0.2.10",
-    "tiktoken",
-    "openai==1.99.1",
 ]

 # HIP (Heterogeneous-computing Interface for Portability) for AMD
@@ -101,7 +100,7 @@ srt_npu = ["sglang[runtime_common]"]
 openai = ["openai==1.99.1", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
-torch_memory_saver = ["torch_memory_saver>=0.0.8"]
+torch_memory_saver = ["torch_memory_saver==0.0.8"]
 decord = ["decord"]
 test = [
     "accelerate",
...
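As a quick, hedged reminder of how these extras are consumed: the editable-install path below is the one used by the CI script later in this diff, while the PyPI extra syntax is an assumption about released wheels rather than something this commit specifies.

    # From a repository checkout, as CI does (pulls runtime_common via the dev extra):
    pip install -e "python[dev]"
    # From PyPI, selecting an extra defined above (assumed package name and extra):
    pip install "sglang[openai]"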
@@ -64,13 +64,12 @@ class ModelConfig:
         hybrid_kvcache_ratio: Optional[float] = None,
         model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
     ) -> None:
-        # Parse args
         self.model_path = model_path
         self.revision = revision
         self.quantization = quantization
         self.model_impl = model_impl
+        # Parse args
         self.maybe_pull_model_tokenizer_from_remote()
         self.model_override_args = json.loads(model_override_args)
         kwargs = {}
@@ -139,6 +138,7 @@ class ModelConfig:
             and self.hf_config.architectures[0] == "Ernie4_5_MoeForCausalLM"
         ):
             self.hf_config.architectures[0] = "Ernie4_5_MoeForCausalLMMTP"
+
         # Check model type
         self.is_generation = is_generation_model(
             self.hf_config.architectures, is_embedding
@@ -282,12 +282,10 @@ class ModelConfig:
         # Cache attributes
         self.hf_eos_token_id = self.get_hf_eos_token_id()

-        config = self.hf_config
-
         # multimodal
-        self.image_token_id = getattr(config, "image_token_id", None) or getattr(
-            config, "image_token_index", None
-        )
+        self.image_token_id = getattr(
+            self.hf_config, "image_token_id", None
+        ) or getattr(self.hf_config, "image_token_index", None)

     @staticmethod
     def from_server_args(server_args: ServerArgs, model_path: str = None, **kwargs):
...
@@ -9,8 +9,8 @@ logger = logging.getLogger(__name__)
 try:
     from mcp import ClientSession
-except ImportError:
-    logger.warning("Ignoring mcp import error")
+except ImportError as e:
+    mcp = e

 from openai_harmony import Author, Message, Role, StreamState, TextContent
...
-from typing import TYPE_CHECKING, Optional, Union
+from typing import Optional, Union

 import torch
...
@@ -3,7 +3,7 @@ from __future__ import annotations
 import builtins
 import inspect
-from typing import TYPE_CHECKING, Callable, Dict, Optional, Type, Union
+from typing import TYPE_CHECKING, Dict, Optional, Type

 import torch
...
@@ -37,7 +37,6 @@ from sglang.srt.utils import (
     is_hip,
     is_port_available,
     is_remote_url,
-    is_triton_kernels_available,
     is_valid_ipv6_address,
     nullable_str,
 )
@@ -109,7 +108,7 @@ class ServerArgs:
     log_level: str = "info"
     log_level_http: Optional[str] = None
     log_requests: bool = False
-    log_requests_level: int = 0
+    log_requests_level: int = 2
     crash_dump_folder: Optional[str] = None
     show_time_cost: bool = False
     enable_metrics: bool = False
@@ -131,6 +130,7 @@ class ServerArgs:
     enable_cache_report: bool = False
     reasoning_parser: Optional[str] = None
     tool_call_parser: Optional[str] = None
+    tool_server: Optional[str] = None

     # Data parallelism
     dp_size: int = 1
@@ -278,15 +278,11 @@ class ServerArgs:
     enable_pdmux: bool = False
     sm_group_num: int = 3

-    # For tool server
-    tool_server: Optional[str] = None
-
     # Deprecated arguments
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False

     def __post_init__(self):
         # Check deprecated arguments
         def print_deprecated_warning(message: str):
             logger.warning(f"\033[33m{message}\033[0m")
@@ -392,6 +388,9 @@ class ServerArgs:
             self.attention_backend = "torch_native"
             self.sampling_backend = "pytorch"

+        # Model-specific adjustments
+        self.model_specific_adjustments()
+
         # Set kernel backends
         if self.device == "cpu":
             if self.attention_backend is None:
@@ -470,55 +469,9 @@ class ServerArgs:
                     "trtllm_mha backend does not support speculative decoding yet."
                 )

-        model_arch = self.get_hf_config().architectures[0]
-        if model_arch in ["GptOssForCausalLM"]:
-            if self.attention_backend is None:
-                # default is triton, but we could have trtllm_mha as an option
-                self.attention_backend = "triton"
-            assert (
-                self.attention_backend == "trtllm_mha"
-                or self.attention_backend == "triton"
-            )
-            quantization_config = getattr(
-                self.get_hf_config(), "quantization_config", None
-            )
-            is_mxfp4_quant_format = (
-                quantization_config is not None
-                and quantization_config.get("quant_method") == "mxfp4"
-            )
-            if is_sm100_supported() and is_mxfp4_quant_format:
-                self.enable_flashinfer_mxfp4_moe = True
-                self.enable_triton_kernel_moe = False
-                logger.info(
-                    "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
-                )
-            else:
-                if self.enable_triton_kernel_moe:
-                    assert (
-                        self.ep_size == 1
-                    ), "Triton kernel MoE is only supported when ep_size == 1"
-                if not self.enable_triton_kernel_moe and self.ep_size == 1:
-                    self.enable_triton_kernel_moe = True
-                    logger.info(
-                        "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
-                    )
-            self.disable_hybrid_swa_memory = True
-            if is_mxfp4_quant_format:
-                # use bf16 for mxfp4 triton kernels
-                self.dtype = "bfloat16"
-
         if self.attention_backend == "dual_chunk_flash_attn":
             logger.warning(
-                "Mixed chunk is disabled because of using dual chunk flash attention backend"
-            )
-            logger.warning(
-                "Radix cache is disabled because of using dual chunk flash attention backend"
-            )
-            logger.warning(
-                "Cuda graph is disabled because of using dual chunk flash attention backend"
+                "Mixed chunk, radix cache, and cuda graphs are disabled because of using dual chunk flash attention backend"
             )
             self.enable_mixed_chunk = False
             self.disable_cuda_graph = True
@@ -583,7 +536,7 @@ class ServerArgs:
         if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
             self.expert_distribution_recorder_mode = "stat"
-            logger.info(
+            logger.warning(
                 "EPLB is enabled. The expert_distribution_recorder_mode is automatically set."
             )
@@ -591,9 +544,6 @@ class ServerArgs:
             self.ep_dispatch_algorithm is None
         ):
             self.ep_dispatch_algorithm = "static"
-            logger.info(
-                "EPLB is enabled or init_expert_location is provided. ep_dispatch_algorithm is configured."
-            )

         if self.enable_eplb:
             assert self.ep_size > 1 or self.moe_a2a_backend is not None
@@ -1112,7 +1062,7 @@ class ServerArgs:
         parser.add_argument(
             "--log-requests-level",
             type=int,
-            default=0,
+            default=ServerArgs.log_requests_level,
             help="0: Log metadata (no sampling parameters). 1: Log metadata and sampling parameters. 2: Log metadata, sampling parameters and partial input/output. 3: Log every input/output.",
             choices=[0, 1, 2, 3],
         )
@@ -1245,6 +1195,12 @@ class ServerArgs:
             default=ServerArgs.tool_call_parser,
             help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', 'qwen3_coder', 'glm45', and 'step3'.",
         )
+        parser.add_argument(
+            "--tool-server",
+            type=str,
+            default=None,
+            help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
+        )

         # Data parallelism
         parser.add_argument(
@@ -1344,55 +1300,41 @@ class ServerArgs:
         )

         # Kernel backend
+        ATTN_BACKENDS = [
+            "aiter",
+            "cutlass_mla",
+            "fa3",
+            "flashinfer",
+            "flashmla",
+            "intel_amx",
+            "torch_native",
+            "ascend",
+            "triton",
+            "trtllm_mla",
+            "trtllm_mha",
+            "dual_chunk_flash_attn",
+        ]
         parser.add_argument(
             "--attention-backend",
             type=str,
-            choices=[
-                "aiter",
-                "cutlass_mla",
-                "fa3",
-                "flashinfer",
-                "flashmla",
-                "intel_amx",
-                "torch_native",
-                "ascend",
-                "triton",
-                "trtllm_mla",
-                "trtllm_mha",
-                "dual_chunk_flash_attn",
-            ],
+            choices=ATTN_BACKENDS,
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
         )
-        parser.add_argument(
-            "--decode-attention-backend",
-            type=str,
-            choices=[
-                "flashinfer",
-                "triton",
-                "torch_native",
-                "fa3",
-                "flashmla",
-                "cutlass_mla",
-            ],
-            default=ServerArgs.decode_attention_backend,
-            help="Choose the kernels for decode attention layers (have priority over --attention-backend).",
-        )
         parser.add_argument(
             "--prefill-attention-backend",
             type=str,
-            choices=[
-                "flashinfer",
-                "triton",
-                "torch_native",
-                "fa3",
-                "flashmla",
-                "cutlass_mla",
-            ],
+            choices=ATTN_BACKENDS,
             default=ServerArgs.prefill_attention_backend,
             help="Choose the kernels for prefill attention layers (have priority over --attention-backend).",
         )
+        parser.add_argument(
+            "--decode-attention-backend",
+            type=str,
+            choices=ATTN_BACKENDS,
+            default=ServerArgs.decode_attention_backend,
+            help="Choose the kernels for decode attention layers (have priority over --attention-backend).",
+        )
         parser.add_argument(
             "--sampling-backend",
             type=str,
@@ -1612,7 +1554,6 @@ class ServerArgs:
             default=ServerArgs.hicache_mem_layout,
             help="The layout of host memory pool for hierarchical cache.",
         )
-
         parser.add_argument(
             "--hicache-storage-backend",
             type=str,
@@ -1985,14 +1926,6 @@ class ServerArgs:
             help="Disable mmap while loading weight using safetensors.",
         )

-        # For tool server
-        parser.add_argument(
-            "--tool-server",
-            type=str,
-            default=None,
-            help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
-        )
-
         # Deprecated arguments
         parser.add_argument(
             "--enable-ep-moe",
@@ -2056,25 +1989,6 @@ class ServerArgs:
                 None,
             }, "moe_dense_tp_size only support 1 and None currently"

-        # Check model architecture
-        model_arch = self.get_hf_config().architectures[0]
-        if "Llama4" in model_arch:
-            assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"
-
-        if model_arch in [
-            "Gemma2ForCausalLM",
-            "Gemma3ForCausalLM",
-            "Gemma3ForConditionalGeneration",
-            "Gemma3nForCausalLM",
-            "Gemma3nForConditionalGeneration",
-        ]:
-            # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
-            # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
-            logger.warning(
-                f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
-            )
-            self.disable_hybrid_swa_memory = True
-
         # Check LoRA
         self.check_lora_server_args()
@@ -2100,7 +2014,7 @@ class ServerArgs:
         if self.lora_paths:
             if self.enable_lora is None:
                 self.enable_lora = True
-                logger.info(
+                logger.warning(
                     "--enable-lora is set to True because --lora-paths is provided."
                 )
             elif self.enable_lora is False:
@@ -2172,6 +2086,58 @@ class ServerArgs:
                 f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
             )

+    def model_specific_adjustments(self):
+        hf_config = self.get_hf_config()
+        model_arch = hf_config.architectures[0]
+
+        if model_arch in ["GptOssForCausalLM"]:
+            if self.attention_backend is None:
+                self.attention_backend = "triton"
+            assert self.attention_backend in [
+                "triton",
+                "trtllm_mha",
+            ], f"GptOssForCausalLM requires 'triton' or 'trtllm_mha' attention backend, but got {self.attention_backend}"
+            quantization_config = getattr(hf_config, "quantization_config", None)
+            is_mxfp4_quant_format = (
+                quantization_config is not None
+                and quantization_config.get("quant_method") == "mxfp4"
+            )
+
+            if is_sm100_supported() and is_mxfp4_quant_format:
+                self.enable_flashinfer_mxfp4_moe = True
+                self.enable_triton_kernel_moe = False
+                logger.warning(
+                    "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
+                )
+            else:
+                if self.enable_triton_kernel_moe:
+                    assert (
+                        self.ep_size == 1
+                    ), "Triton kernel MoE is only supported when ep_size == 1"
+                if not self.enable_triton_kernel_moe and self.ep_size == 1:
+                    self.enable_triton_kernel_moe = True
+                    logger.warning(
+                        "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
+                    )
+
+            self.disable_hybrid_swa_memory = True
+
+            if is_mxfp4_quant_format:
+                # use bf16 for mxfp4 triton kernels
+                self.dtype = "bfloat16"
+        elif "Llama4" in model_arch:
+            assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"
+        elif model_arch in [
+            "Gemma2ForCausalLM",
+            "Gemma3ForCausalLM",
+            "Gemma3ForConditionalGeneration",
+            "Gemma3nForCausalLM",
+            "Gemma3nForConditionalGeneration",
+        ]:
+            # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
+            # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
+            logger.warning(
+                f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
+            )
+            self.disable_hybrid_swa_memory = True
+
     def adjust_mem_fraction_for_vlm(self, model_config):
         vision_config = getattr(model_config.hf_config, "vision_config", None)
         if vision_config is None:
...
@@ -2,68 +2,71 @@
 # Install the dependency in CI.
 set -euxo pipefail

-MODE_BLACKWELL=${MODE_BLACKWELL:-0}
-CU_VERSION="cu126"
-if [ "$MODE_BLACKWELL" = "1" ]; then
+IS_BLACKWELL=${IS_BLACKWELL:-0}
+if [ "$IS_BLACKWELL" = "1" ]; then
     CU_VERSION="cu129"
+else
+    CU_VERSION="cu126"
 fi

 # Kill existing processes
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 bash "${SCRIPT_DIR}/killall_sglang.sh"

-if [ "$MODE_BLACKWELL" = "1" ]; then
-    apt-get install -y git libnuma-dev
-fi
+# Install apt packages
+apt install -y git libnuma-dev

-# Update pip
-if [ "$MODE_BLACKWELL" != "1" ]; then
+# Install uv
+if [ "$IS_BLACKWELL" = "1" ]; then
+    # The blackwell CI runner has some issues with pip and uv,
+    # so we can only use pip with `--break-system-packages`
+    PIP_CMD="pip"
+    PIP_INSTALL_SUFFIX="--break-system-packages"
+
+    # Clean up existing installations
+    $PIP_CMD uninstall -y flashinfer_python sgl-kernel sglang vllm $PIP_INSTALL_SUFFIX || true
+else
+    # In normal cases, we use uv, which is much faster than pip.
     pip install --upgrade pip
-fi
+    pip install uv
+    export UV_SYSTEM_PYTHON=true
+    PIP_CMD="uv pip"
+    PIP_INSTALL_SUFFIX="--index-strategy unsafe-best-match"

-# Clean up existing installations
-pip uninstall -y flashinfer flashinfer_python sgl-kernel sglang vllm --break-system-packages || true
-pip cache purge || true
-rm -rf /root/.cache/flashinfer
-# TODO handle other python versions
-rm -rf /usr/local/lib/python3.10/dist-packages/flashinfer*
-rm -rf /usr/local/lib/python3.10/dist-packages/sgl_kernel*
+    # Clean up existing installations
+    $PIP_CMD uninstall flashinfer_python sgl-kernel sglang vllm || true
+fi

 # Install the main package
-pip install -e "python[dev]" --extra-index-url https://download.pytorch.org/whl/${CU_VERSION} --break-system-packages
+$PIP_CMD install -e "python[dev]" --extra-index-url https://download.pytorch.org/whl/${CU_VERSION} $PIP_INSTALL_SUFFIX

-if [ "$MODE_BLACKWELL" = "1" ]; then
+if [ "$IS_BLACKWELL" = "1" ]; then
     # TODO auto determine sgl-kernel version
     SGL_KERNEL_VERSION=0.3.2
-    pip3 install https://github.com/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}-cp39-abi3-manylinux2014_x86_64.whl --break-system-packages
+    $PIP_CMD install https://github.com/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}-cp39-abi3-manylinux2014_x86_64.whl --force-reinstall $PIP_INSTALL_SUFFIX
 fi

 # Show current packages
-pip list
+$PIP_CMD list

 # Install additional dependencies
-pip install mooncake-transfer-engine==0.3.5 nvidia-cuda-nvrtc-cu12 --break-system-packages
+$PIP_CMD install mooncake-transfer-engine==0.3.5 nvidia-cuda-nvrtc-cu12 py-spy huggingface_hub[hf_xet] $PIP_INSTALL_SUFFIX

-if [ "$MODE_BLACKWELL" != "1" ]; then
+if [ "$IS_BLACKWELL" != "1" ]; then
     # For lmms_evals evaluating MMMU
     git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
-    pip install -e lmms-eval/ --break-system-packages
-fi
+    $PIP_CMD install -e lmms-eval/ $PIP_INSTALL_SUFFIX

-# Install FlashMLA for attention backend tests
-# pip install git+https://github.com/deepseek-ai/FlashMLA.git --break-system-packages
-
-# Install hf_xet
-pip install huggingface_hub[hf_xet] --break-system-packages
-
-if [ "$MODE_BLACKWELL" != "1" ]; then
     # Install xformers
-    pip install -U xformers --index-url https://download.pytorch.org/whl/${CU_VERSION} --no-deps --force-reinstall --break-system-packages
+    $PIP_CMD install xformers --index-url https://download.pytorch.org/whl/${CU_VERSION} --no-deps $PIP_INSTALL_SUFFIX
 fi

-# To help dumping traces when timeout occurred
-pip install py-spy --break-system-packages
+# Install FlashMLA for attention backend tests
+# $PIP_CMD install git+https://github.com/deepseek-ai/FlashMLA.git $PIP_INSTALL_SUFFIX

 # Show current packages
-pip list
+$PIP_CMD list
+
+echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
import json
import re
import sys


def clean_json_file(input_file, output_file):
    try:
        # Open the input file with 'replace' option for handling bad characters
        with open(input_file, "r", encoding="utf-8", errors="replace") as f:
            data = f.read()

        # Replace bad characters (represented by '�' after decoding) with a space
        cleaned_data = data.replace("�", " ")

        # Remove control characters (e.g., ASCII control characters like \x00 to \x1F)
        # These can cause issues in JSON parsing.
        cleaned_data = re.sub(r"[\x00-\x1F]+", " ", cleaned_data)

        # Parse cleaned data as JSON
        json_data = json.loads(cleaned_data)

        # Write the cleaned JSON to a new output file
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(json_data, f, ensure_ascii=False, indent=4)

        print(f"Cleaned JSON file has been saved to {output_file}")
    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    assert len(sys.argv) > 1, "please give the input file path"
    if len(sys.argv) == 3:
        input_file = sys.argv[1]
        output_file = sys.argv[2]
    else:
        input_file = output_file = sys.argv[1]
    clean_json_file(input_file, output_file)
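A usage sketch for the new JSON-cleaning helper above (the script name below is hypothetical; this excerpt does not show where the file is added in the repository):

    # Hypothetical path; pass one argument to clean in place, or two to write a separate output.
    python3 clean_json.py results.json cleaned_results.json
    python3 clean_json.py results.json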
@@ -27,7 +27,6 @@ else
         lsof /dev/nvidia* | awk '{print $2}' | xargs kill -9 2>/dev/null
     fi
-
     # Show GPU status after clean up
     nvidia-smi
 fi
@@ -8,8 +8,6 @@ suites = {
         TestFile("test_srt_backend.py"),
         # Skip this due to some OPENAI_API_KEY issues
         # "test_openai_backend.py",
-        TestFile("test_separate_reasoning.py"),
-        TestFile("test_separate_reasoning_execution.py"),
     ],
 }
...
@@ -38,7 +38,6 @@ suites = {
         TestFile("openai_server/basic/test_serving_embedding.py", 10),
         TestFile("openai_server/basic/test_openai_embedding.py", 141),
         TestFile("openai_server/basic/test_openai_server.py", 149),
-        TestFile("openai_server/features/test_cache_report.py", 100),
         TestFile("openai_server/features/test_enable_thinking.py", 70),
         TestFile("openai_server/features/test_json_constrained.py", 98),
         TestFile("openai_server/features/test_json_mode.py", 90),
@@ -103,7 +102,6 @@ suites = {
         TestFile("test_update_weights_from_disk.py", 114),
         TestFile("test_update_weights_from_tensor.py", 48),
         TestFile("test_utils_update_weights.py", 48),
-        TestFile("test_vertex_endpoint.py", 31),
         TestFile("test_vision_chunked_prefill.py", 175),
         TestFile("test_vlm_input_format.py", 300),
         TestFile("test_vision_openai_server_a.py", 584),
@@ -167,7 +165,6 @@ suites = {
         TestFile("models/lora/test_lora_tp.py", 116),
         TestFile("test_data_parallelism.py", 73),
         TestFile("test_dp_attention.py", 277),
-        TestFile("test_mla_tp.py", 170),
         TestFile("test_patch_torch.py", 19),
         TestFile("test_update_weights_from_distributed.py", 103),
         TestFile("test_release_memory_occupation.py", 127),
@@ -175,7 +172,6 @@ suites = {
     "per-commit-2-gpu-amd": [
         TestFile("models/lora/test_lora_tp.py", 116),
         TestFile("test_data_parallelism.py", 73),
-        TestFile("test_mla_tp.py", 170),
         TestFile("test_patch_torch.py", 19),
         TestFile("test_update_weights_from_distributed.py", 103),
     ],
...
@@ -15,7 +15,7 @@ from sglang.test.test_utils import (
 TEST_MODEL_MATRIX = {
     "Qwen/Qwen2.5-7B-Instruct": {
-        "accuracy": 0.85,
+        "accuracy": 0.84,
         "latency": 150,
         "output_throughput": 30,
     },
...