Unverified Commit 811b10a6 authored by ishandhanani's avatar ishandhanani Committed by GitHub
Browse files

chore: bump sglang version (#1219)

parent c12f61a6
......@@ -136,16 +136,14 @@ RUN if [ "$ARCH" = "arm64" ]; then \
fi
# Install sglang from a pinned upstream commit.
# Once either 0.4.6post6 or 0.4.7 is released, we can switch back to using the published version
# This commit references a fix for DP attention and NIXL https://github.com/sgl-project/sglang/pull/6473
ARG SGLANG_COMMIT="e806f708c954020bda7d1cc98035a44fd6a4eb96"
# The install below is editable (-e): the installed package points back at the
# cloned source tree, so the checkout must NOT be removed after installation.
RUN --mount=type=cache,target=/root/.cache/uv \
    git clone https://github.com/sgl-project/sglang.git && \
    cd sglang && \
    git checkout "${SGLANG_COMMIT}" && \
    uv pip install -e "python[all]"
# Common dependencies
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
......
......@@ -77,3 +77,13 @@ Because Dynamo has a discovery mechanism, we do not use a load balancer. Instead
cd /workspace/examples/sglang
dynamo serve graphs.disagg:Frontend -f ./configs/disagg.yaml
```
##### Disaggregated with MoE and DP attention
SGLang also supports DP attention for MoE models. We provide an example config for this in `configs/disagg-dp-attention.yaml`, which is based on the [DeepSeek-R1-Small-2layers](https://huggingface.co/silence09/DeepSeek-R1-Small-2layers) model. You can use this configuration to test disaggregated serving on a single node before scaling to the full DeepSeek-R1 model across multiple nodes.
```bash
# Note: this example requires 4 GPUs (2 for the prefill worker and 2 for the decode worker)
cd /workspace/examples/sglang
dynamo serve graphs.disagg:Frontend -f ./configs/disagg-dp-attention.yaml
```
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Example graph config: disaggregated serving (separate prefill and decode
# workers) with DP attention enabled, using NIXL as the transfer backend.
# Each worker requests 2 GPUs, so the whole graph needs 4 GPUs.

Frontend:
  # NOTE(review): this name differs from the workers' served-model-name
  # (silence09/DeepSeek-R1-Small-2layers) — confirm the mismatch is intended.
  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
  endpoint: dynamo.SGLangWorker.generate
  port: 8000

# Prefill worker (disaggregation-mode: prefill).
SGLangWorker:
  model-path: silence09/DeepSeek-R1-Small-2layers
  served-model-name: silence09/DeepSeek-R1-Small-2layers
  tp: 2
  dp-size: 2
  enable-dp-attention: true
  trust-remote-code: true
  skip-tokenizer-init: true
  disaggregation-mode: prefill
  disaggregation-transfer-backend: nixl
  port: 30000
  ServiceArgs:
    workers: 1
    resources:
      gpu: 2

# Decode worker (disaggregation-mode: decode).
SGLangDecodeWorker:
  model-path: silence09/DeepSeek-R1-Small-2layers
  served-model-name: silence09/DeepSeek-R1-Small-2layers
  tp: 2
  dp-size: 2
  enable-dp-attention: true
  trust-remote-code: true
  skip-tokenizer-init: true
  disaggregation-mode: decode
  disaggregation-transfer-backend: nixl
  # SGLang requires a port delta between prefill and decode workers when using enable-dp-attention
  port: 31000
  ServiceArgs:
    workers: 1
    resources:
      gpu: 2
\ No newline at end of file
......@@ -67,7 +67,7 @@ vllm = [
]
# Pinned to the same sglang commit as the SGLANG_COMMIT build argument in the Dockerfile;
# keep the two in sync when bumping.
sglang = [
    "sglang[all]@git+https://github.com/sgl-project/sglang@e806f708c954020bda7d1cc98035a44fd6a4eb96#subdirectory=python"
]
[project.scripts]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment