fix: readme instructions for worker running (#2266)

053ac33e · ishandhanani · GitHub · dbb4caaf · 053ac33e · 053ac33e
Unverified commit 053ac33e, authored Aug 04, 2025 by ishandhanani; committed by GitHub on Aug 04, 2025
3 changed files
--- a/README.md
+++ b/README.md
@@ -115,11 +115,11 @@ Dynamo provides a simple way to spin up a local set of inference components incl

 ```
 # Start an OpenAI compatible HTTP server, a pre-processor (prompt templating and tokenization) and a router:
-python -m dynamo.frontend [--http-port 8080]
+python -m dynamo.frontend --http-port 8080

 # Start the SGLang engine, connecting to NATS and etcd to receive requests. You can run several of these,
 # both for the same model and for multiple models. The frontend node will discover them.
-python -m dynamo.sglang.worker deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+python -m dynamo.sglang.worker --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B --skip-tokenizer-init
 ```

 #### Send a Request

--- a/components/backends/sglang/docs/dsr1-wideep-gb200.md
+++ b/components/backends/sglang/docs/dsr1-wideep-gb200.md
@@ -67,8 +67,6 @@ docker run \
 ```bash
 # run ingress
 python3 -m dynamo.frontend --http-port=8000 &
-# optionally run the http server that allows you to flush the kv cache for all workers (see benchmarking section below)
-python3 utils/sgl_http_server.py --ns dynamo &
 # run prefill worker
 SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=2048 \
 MC_TE_METRIC=true \
@@ -82,7 +80,7 @@ NCCL_CUMEM_ENABLE=1 \
 SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
 SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
 PYTHONUNBUFFERED=1 \
-python3 components/worker.py \
+python3 -m dynamo.sglang.worker \
  --served-model-name deepseek-ai/DeepSeek-R1 \
  --model-path /model/ \
  --skip-tokenizer-init \
@@ -90,7 +88,6 @@ python3 components/worker.py \
  --disaggregation-mode prefill \
  --dist-init-addr ${HEAD_PREFILL_NODE_IP}:29500 \
  --disaggregation-bootstrap-port 30001 \
-  --disaggregation-transfer-backend nixl \
  --nnodes 2 \
  --node-rank 0 \
  --tp-size 8 \
@@ -134,7 +131,7 @@ NCCL_CUMEM_ENABLE=1 \
 SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
 SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
 PYTHONUNBUFFERED=1 \
-python3 components/decode_worker.py \
+python3 -m dynamo.sglang.decode_worker \
  --served-model-name deepseek-ai/DeepSeek-R1 \
  --model-path /model/ \
  --skip-tokenizer-init \

--- a/components/backends/sglang/slurm_jobs/scripts/gb200.sh
+++ b/components/backends/sglang/slurm_jobs/scripts/gb200.sh
@@ -94,7 +94,6 @@ if [ "$mode" = "prefill" ]; then
            --disaggregation-mode prefill \
            --dist-init-addr "$HOST_IP:$PORT" \
            --disaggregation-bootstrap-port 30001 \
-            --disaggregation-transfer-backend nixl \
            --nnodes "$TOTAL_NODES" \
            --node-rank "$RANK" \
            --tp-size "$TOTAL_GPUS" \