chore(sglang): Move examples/sglang to components/backends/sglang (#2046)

d65ce1b0 · Graham King · GitHub · 78826932 · d65ce1b0 · d65ce1b0
Unverified Commit d65ce1b0 authored Jul 22, 2025 by Graham King Committed by GitHub Jul 22, 2025
20 changed files
--- a/examples/sglang/README.md
+++ b/examples/sglang/README.md
 <!--
 SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: Apache-2.0
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
 -->

-# LLM Deployment Examples using SGLang
+# LLM Deployment using SGLang

-This directory contains examples and reference implementations for deploying Large Language Models (LLMs) in various configurations using SGLang. SGLang internally uses ZMQ to communicate between the ingress and the engine processes. For Dynamo, we leverage the runtime to communicate directly with the engine processes and handle ingress and pre/post processing on our end.
+This directory contains an SGLang component for Dynamo and reference implementations for deploying Large Language Models (LLMs) in various configurations using SGLang. SGLang internally uses ZMQ to communicate between the ingress and the engine processes. For Dynamo, we leverage the runtime to communicate directly with the engine processes and handle ingress and pre/post processing on our end.

 ## Use the Latest Release

@@ -100,20 +88,20 @@ docker pull nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.3.2
 ### Aggregated Serving

 ```bash
-cd $DYNAMO_ROOT/examples/sglang
+cd $DYNAMO_ROOT/components/backends/sglang
 ./launch/agg.sh
 ```

 ### Aggregated Serving with KV Routing

 > [!NOTE]
-> The current implementation of `examples/sglang/components/worker.py` publishes _placeholder_ engine metrics to keep the Dynamo KV-router happy. Real-time metrics will be surfaced directly from the SGLang engine once the following pull requests are merged:
+> The current implementation of `components/backends/sglang/src/dynamo/sglang/worker/main.py` publishes _placeholder_ engine metrics to keep the Dynamo KV-router happy. Real-time metrics will be surfaced directly from the SGLang engine once the following pull requests are merged:
 > • Dynamo: [ai-dynamo/dynamo #1465](https://github.com/ai-dynamo/dynamo/pull/1465) – _feat: receive kvmetrics from sglang scheduler_.
 >
-> After these are in, the TODOs in `worker.py` will be resolved and the placeholder logic removed.
+> After these are in, the TODOs in `main.py` will be resolved and the placeholder logic removed.

 ```bash
-cd $DYNAMO_ROOT/examples/sglang
+cd $DYNAMO_ROOT/components/backends/sglang
 ./launch/agg_router.sh
 ```

@@ -137,7 +125,7 @@ Because Dynamo has a discovery mechanism, we do not use a load balancer. Instead
 > Disaggregated serving in SGLang currently requires each worker to have the same tensor parallel size [unless you are using an MLA based model](https://github.com/sgl-project/sglang/pull/5922)

 ```bash
-cd $DYNAMO_ROOT/examples/sglang
+cd $DYNAMO_ROOT/components/backends/sglang
 ./launch/disagg.sh
 ```

@@ -147,7 +135,7 @@ You can use this configuration to test out disaggregated serving with dp attenti

 ```bash
 # note this will require 4 GPUs
-cd $DYNAMO_ROOT/examples/sglang
+cd $DYNAMO_ROOT/components/backends/sglang
 ./launch/disagg_dp_attn.sh
 ```


--- a/examples/sglang/configs/deepseek_r1/wideep/deepep.json
+++ b/examples/sglang/configs/deepseek_r1/wideep/deepep.json
--- a/examples/sglang/docs/dsr1-wideep-h100.md
+++ b/examples/sglang/docs/dsr1-wideep-h100.md
 <!--
 SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: Apache-2.0
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
 -->

 # Running DeepSeek-R1 Disaggregated with WideEP on H100s
@@ -59,7 +47,7 @@ docker run \
    dynamo-wideep:latest
 ```

-In each container, you should be in the `/sgl-workspace/dynamo/examples/sglang` directory.
+In each container, you should be in the `/sgl-workspace/dynamo/components/backends/sglang` directory.

 4. On the head prefill node, run the provided helper script to generate the commands that start `nats-server` and `etcd`. This script will also tell you which environment variables to export on each node to make deployment easier.

@@ -75,7 +63,7 @@ dynamo run in=http out=dyn &
 # optionally run the http server that allows you to flush the kv cache for all workers (see benchmarking section below)
 python3 utils/sgl_http_server.py --ns dynamo &
 # run prefill worker
-python3 components/worker.py \
+python3 -m dynamo.sglang.worker \
  --model-path /model/ \
  --served-model-name deepseek-ai/DeepSeek-R1 \
  --skip-tokenizer-init \
@@ -110,7 +98,7 @@ On the other prefill node (since this example has 4 total prefill nodes), run th
 7. Run the decode worker on the head decode node

 ```bash
-python3 components/decode_worker.py \
+python3 -m dynamo.sglang.decode_worker \
  --model-path /model/ \
  --served-model-name deepseek-ai/DeepSeek-R1 \
  --skip-tokenizer-init \
@@ -184,7 +172,7 @@ We provide a script that generates a JSONL file of the ShareGPT dataset and then
 Example usage:
 ```bash
 # generate data
-python3 utils/generate_bench_data.py --output data.jsonl --num-prompts 8192 --input-len 4096 --output-len 5 --model deepseek-ai/DeepSeek-R1
+python3 src/dynamo/sglang/utils/generate_bench_data.py --output data.jsonl --num-prompts 8192 --input-len 4096 --output-len 5 --model deepseek-ai/DeepSeek-R1
 # if you ran the http server on the head prefill node, you can optionally flush the kv cache for all workers (similar to SGLang's benchmarking script)
 curl -X POST http://${HEAD_PREFILL_NODE_IP}:9001/flush_cache
 # run benchmark

--- a/examples/sglang/docs/multinode-examples.md
+++ b/examples/sglang/docs/multinode-examples.md
@@ -11,7 +11,7 @@ SGLang allows you to deploy multi-node sized models by adding in the `dist-init-

 **Step 1**: Use the provided helper script to generate commands to start NATS/ETCD on your head prefill node. This script will also give you environment variables to export on each other node. You will need the IP addresses of your head prefill and head decode node to run this script.
 ```bash
-./utils/gen_env_vars.sh
+./components/backends/sglang/src/dynamo/sglang/utils/gen_env_vars.sh
 ```

 **Step 2**: Ensure that your configuration file has the required arguments. Here's an example configuration that runs prefill and the model in TP16:
@@ -21,7 +21,7 @@ Node 1: Run HTTP ingress, processor, and 8 shards of the prefill worker
 # run ingress
 dynamo run in=http out=dyn &
 # run prefill worker
-python3 components/worker.py \
+python3 -m dynamo.sglang.worker \
  --model-path /model/ \
  --served-model-name deepseek-ai/DeepSeek-R1 \
  --tp 16 \
@@ -40,7 +40,7 @@ python3 components/worker.py \

 Node 2: Run the remaining 8 shards of the prefill worker
 ```bash
-python3 components/worker.py \
+python3 -m dynamo.sglang.worker \
  --model-path /model/ \
  --served-model-name deepseek-ai/DeepSeek-R1 \
  --tp 16 \
@@ -59,7 +59,7 @@ python3 components/worker.py \

 Node 3: Run the first 8 shards of the decode worker
 ```bash
-python3 components/decode_worker.py \
+python3 -m dynamo.sglang.decode_worker \
  --model-path /model/ \
  --served-model-name deepseek-ai/DeepSeek-R1 \
  --tp 16 \
@@ -78,7 +78,7 @@ python3 components/decode_worker.py \

 Node 4: Run the remaining 8 shards of the decode worker
 ```bash
-python3 components/decode_worker.py \
+python3 -m dynamo.sglang.decode_worker \
  --model-path /model/ \
  --served-model-name deepseek-ai/DeepSeek-R1 \
  --tp 16 \

--- a/examples/sglang/docs/sgl-http-server.md
+++ b/examples/sglang/docs/sgl-http-server.md
 <!--
 SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: Apache-2.0
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
 -->

 # Supporting SGLang's native endpoints via HTTP Server
@@ -86,7 +74,7 @@ The server accepts the following command-line arguments:

 Start the server:
 ```bash
-python sgl_http_server.py --port 9001 --namespace dynamo
+python src/dynamo/sglang/utils/sgl_http_server.py --port 9001 --namespace dynamo
 ```

 The server will automatically discover all SGLang components in the specified namespace and provide HTTP endpoints for managing them.
--- a/examples/sglang/launch/agg.sh
+++ b/examples/sglang/launch/agg.sh
@@ -12,14 +12,14 @@ cleanup() {
 trap cleanup EXIT INT TERM

 # run clear_namespace
-python3 utils/clear_namespace.py --namespace dynamo
+python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo

 # run ingress
 dynamo run in=http out=dyn --http-port=8000 &
 DYNAMO_PID=$!

 # run worker
-python3 components/worker.py \
+python3 -m dynamo.sglang.worker \
  --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
  --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
  --page-size 16 \

--- a/examples/sglang/launch/agg_router.sh
+++ b/examples/sglang/launch/agg_router.sh
@@ -12,14 +12,14 @@ cleanup() {
 trap cleanup EXIT INT TERM

 # run clear_namespace
-python3 utils/clear_namespace.py --namespace dynamo
+python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo

 # run ingress
 dynamo run in=http out=dyn --router-mode kv --http-port=8000 &
 DYNAMO_PID=$!

 # run worker
-python3 components/worker.py \
+python3 -m dynamo.sglang.worker \
  --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
  --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
  --page-size 16 \

--- a/examples/sglang/launch/disagg.sh
+++ b/examples/sglang/launch/disagg.sh
@@ -12,14 +12,14 @@ cleanup() {
 trap cleanup EXIT INT TERM

 # run clear_namespace
-python3 utils/clear_namespace.py --namespace dynamo
+python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo

 # run ingress
 dynamo run in=http out=dyn --http-port=8000 &
 DYNAMO_PID=$!

 # run prefill worker
-python3 components/worker.py \
+python3 -m dynamo.sglang.worker \
  --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
  --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
  --page-size 16 \
@@ -31,7 +31,7 @@ python3 components/worker.py \
 PREFILL_PID=$!

 # run decode worker
-CUDA_VISIBLE_DEVICES=1 python3 components/decode_worker.py \
+CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang.decode_worker \
  --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
  --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
  --page-size 16 \

--- a/examples/sglang/launch/disagg_dp_attn.sh
+++ b/examples/sglang/launch/disagg_dp_attn.sh
@@ -12,14 +12,14 @@ cleanup() {
 trap cleanup EXIT INT TERM

 # run clear_namespace
-python3 utils/clear_namespace.py --namespace dynamo
+python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo

 # run ingress
 dynamo run in=http out=dyn --http-port=8000 &
 DYNAMO_PID=$!

 # run prefill worker
-python3 components/worker.py \
+python3 -m dynamo.sglang.worker \
  --model-path silence09/DeepSeek-R1-Small-2layers \
  --served-model-name silence09/DeepSeek-R1-Small-2layers \
  --tp 2 \
@@ -33,7 +33,7 @@ python3 components/worker.py \
 PREFILL_PID=$!

 # run decode worker
-CUDA_VISIBLE_DEVICES=2,3 python3 components/decode_worker.py \
+CUDA_VISIBLE_DEVICES=2,3 python3 -m dynamo.sglang.decode_worker \
  --model-path silence09/DeepSeek-R1-Small-2layers \
  --served-model-name silence09/DeepSeek-R1-Small-2layers \
  --tp 2 \

--- a/components/backends/sglang/requirements.txt
+++ b/components/backends/sglang/requirements.txt
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+sglang[all]>=0.4.9.post2
+uvloop
--- a/examples/sglang/slurm_jobs/.gitignore
+++ b/examples/sglang/slurm_jobs/.gitignore
--- a/examples/sglang/slurm_jobs/README.md
+++ b/examples/sglang/slurm_jobs/README.md
--- a/examples/sglang/slurm_jobs/job_script_template.j2
+++ b/examples/sglang/slurm_jobs/job_script_template.j2
--- a/examples/sglang/slurm_jobs/scripts/monitor_gpu_utilization.sh
+++ b/examples/sglang/slurm_jobs/scripts/monitor_gpu_utilization.sh
--- a/examples/sglang/slurm_jobs/scripts/worker_setup.py
+++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.

 """
 Worker setup script for Slurm nodes.
@@ -230,7 +218,7 @@ def setup_prefill_node(
    # NOTE: This implements the example in components/backends/sglang/docs/dsr1-wideep-h100.md
    # For other examples, the command might have to be modified.
    dynamo_cmd = (
-        f"python3 components/worker.py "
+        f"python3 -m dynamo.sglang.worker "
        "--model-path /model/ "
        "--served-model-name deepseek-ai/DeepSeek-R1 "
        "--skip-tokenizer-init "
@@ -278,7 +266,7 @@ def setup_decode_node(
        raise RuntimeError("Failed to connect to etcd")

    dynamo_cmd = (
-        "python3 components/decode_worker.py "
+        "python3 -m dynamo.sglang.decode_worker "
        "--model-path /model/ "
        "--served-model-name deepseek-ai/DeepSeek-R1 "
        "--skip-tokenizer-init "

--- a/examples/sglang/slurm_jobs/submit_job_script.py
+++ b/examples/sglang/slurm_jobs/submit_job_script.py
--- a/examples/sglang/__init__.py
+++ b/examples/sglang/__init__.py
--- a/examples/sglang/components/__init__.py
+++ b/examples/sglang/components/__init__.py
--- a/components/backends/sglang/src/dynamo/sglang/decode_worker/__main__.py
+++ b/components/backends/sglang/src/dynamo/sglang/decode_worker/__main__.py
+#  SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#  SPDX-License-Identifier: Apache-2.0
+
+
+from dynamo.sglang.decode_worker.main import main
+
+if __name__ == "__main__":
+    main()
--- a/examples/sglang/components/decode_worker.py
+++ b/examples/sglang/components/decode_worker.py
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.

 from __future__ import annotations

@@ -24,10 +12,10 @@ import msgspec
 import sglang as sgl
 import uvloop
 from sglang.srt.server_args import ServerArgs
-from utils.sgl_utils import parse_sglang_args_inc

 from dynamo.runtime import DistributedRuntime, dynamo_worker
 from dynamo.runtime.logging import configure_dynamo_logging
+from dynamo.sglang.utils.sgl_utils import parse_sglang_args_inc

 configure_dynamo_logging()

@@ -106,6 +94,10 @@ async def init(runtime: DistributedRuntime, server_args: ServerArgs):
    await asyncio.gather(*tasks)


-if __name__ == "__main__":
+def main():
    uvloop.install()
    asyncio.run(worker())
+
+
+if __name__ == "__main__":
+    main()