"git@developer.sourcefind.cn:OpenDAS/dynamo.git" did not exist on "4b1867c53ebbf98dea54623af24d2424ead56573"
Commit 194abde3 authored by ptarasiewiczNV, committed by GitHub

docs: Add instructions for multi-node disaggregated deployment


Signed-off-by: ptarasiewiczNV <104908264+ptarasiewiczNV@users.noreply.github.com>
Co-authored-by: Neelay Shah <neelays@nvidia.com>
parent a8c5637f
@@ -319,7 +319,24 @@ In the commands above, we used the FP8 variant `neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8`
```
## 7. Multi-node Deployment
To deploy the solution in a multi-node environment, refer to the [deploy_llama_8b_disaggregated_multinode.sh](examples/llm/vllm/deploy/deploy_llama_8b_disaggregated_multinode.sh) script. On the head node, start the NATS server, the API server, and the context worker with:
```
./examples/llm/vllm/deploy/deploy_llama_8b_disaggregated_multinode.sh context --head-url <head url>
```
On the second node, run the generate worker:
```
./examples/llm/vllm/deploy/deploy_llama_8b_disaggregated_multinode.sh generate --head-url <head url>
```
By default, the example script launches one context worker with TP 1 on the head node and one generate worker with TP 1 on the secondary node. Other configurations are possible; see the script for details and the sketch below for one example.
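For instance, to shard the context worker across two GPUs on the head node, the exported defaults at the top of the script can be edited before launching. This is a minimal sketch; the variable names come from the script below, and `CUDA_VISIBLE_DEVICES` in `start_context_worker` (hard-coded to GPU 0 in the script) would also need to expose both GPUs:
```
# Edits inside deploy_llama_8b_disaggregated_multinode.sh
export VLLM_CONTEXT_WORKERS=1   # still a single context worker
export VLLM_CONTEXT_TP_SIZE=2   # shard it across two GPUs on the head node
export VLLM_GENERATE_TP_SIZE=1  # generate worker on the second node unchanged

# ...and in start_context_worker, expose both GPUs to the worker:
# CUDA_VISIBLE_DEVICES=0,1 \
```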
## 8. Known Issues & Limitations
1. **Fixed Worker Count**
Currently, the number of prefill and decode workers must be fixed at the start of deployment. Dynamically adding or removing workers is not yet supported.
@@ -333,8 +350,14 @@ In the commands above, we used the FP8 variant `neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8`
4. **Experimental Patch**
The required vLLM patch is experimental and not yet merged into upstream vLLM. Future releases may remove the need for a custom patch.
5. **Single generate worker**
Only one generate worker can be used in a single deployment.
6. **Streaming**
When streaming is enabled, only two responses are returned in the stream: the first token and then the complete response (see the example request below).
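For illustration, the request below (a trimmed variant of the one issued at the end of the example script; `<head url>` and port `8005` are the script's defaults) enables streaming, so only those two chunks appear in the output:
```
curl -N <head url>:8005/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "llama",
    "messages": [
      {"role": "system", "content": "What is the capital of France?"}
    ],
    "max_tokens": 25,
    "stream": true
  }'
```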
## 9. References
[^1]: Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. Distserve: Disaggregating prefill and decoding for goodput-optimized large language
......
#!/bin/bash
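# Multi-node disaggregated deployment helper.
# Head node:   ./deploy_llama_8b_disaggregated_multinode.sh context  --head-url <head url>   -> NATS server, API server, context worker
# Second node: ./deploy_llama_8b_disaggregated_multinode.sh generate --head-url <head url>   -> generate worker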
export VLLM_ATTENTION_BACKEND=FLASHINFER
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export VLLM_TORCH_HOST=""
export VLLM_TORCH_PORT=36183
export VLLM_BASELINE_WORKERS=0
export VLLM_CONTEXT_WORKERS=1
export VLLM_GENERATE_WORKERS=1
export VLLM_BASELINE_TP_SIZE=1
export VLLM_CONTEXT_TP_SIZE=1
export VLLM_GENERATE_TP_SIZE=1
export VLLM_LOGGING_LEVEL=INFO
export VLLM_DATA_PLANE_BACKEND=nccl
export PYTHONUNBUFFERED=1
export NATS_HOST=""
export NATS_PORT=4223
export NATS_STORE="$(mktemp -d)"
export API_SERVER_HOST=""
export API_SERVER_PORT=8005
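# Launch helpers: each takes the head node URL as its only argument and starts
# its component in the background.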
start_nats_server() {
local head_url=$1
export NATS_HOST="$head_url"
echo "Flushing NATS store: ${NATS_STORE}..."
rm -r "${NATS_STORE}"
echo "Starting NATS Server..."
nats-server -p ${NATS_PORT} --jetstream --store_dir "${NATS_STORE}" &
}
start_api_server() {
local head_url=$1
export VLLM_TORCH_HOST="$head_url"
echo "Starting LLM API Server..."
python3 -m llm.api_server \
--tokenizer neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--request-plane-uri ${head_url}:${NATS_PORT} \
--api-server-host ${API_SERVER_HOST} \
--model-name llama \
--api-server-port ${API_SERVER_PORT} &
}
start_context_worker() {
local head_url=$1
export VLLM_TORCH_HOST="$head_url"
echo "Starting vLLM context workers..."
CUDA_VISIBLE_DEVICES=0 \
VLLM_WORKER_ID=0 \
python3 -m llm.vllm.deploy \
--context-worker-count ${VLLM_CONTEXT_WORKERS} \
--request-plane-uri ${head_url}:${NATS_PORT} \
--model-name neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--kv-cache-dtype fp8 \
--dtype auto \
--worker-name llama \
--disable-async-output-proc \
--disable-log-stats \
--max-model-len 1000 \
--max-batch-size 10000 \
--gpu-memory-utilization 0.9 \
--context-tp-size ${VLLM_CONTEXT_TP_SIZE} \
--generate-tp-size ${VLLM_GENERATE_TP_SIZE} \
--log-dir "/tmp/vllm_logs" &
}
start_generate_worker() {
local head_url=$1
export VLLM_TORCH_HOST="$head_url"
echo "Starting vLLM generate workers..."
CUDA_VISIBLE_DEVICES=1 \
VLLM_WORKER_ID=1 \
python3 -m llm.vllm.deploy \
--generate-worker-count ${VLLM_GENERATE_WORKERS} \
--request-plane-uri ${head_url}:${NATS_PORT} \
--model-name neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--kv-cache-dtype fp8 \
--dtype auto \
--worker-name llama \
--disable-async-output-proc \
--disable-log-stats \
--max-model-len 1000 \
--max-batch-size 10000 \
--gpu-memory-utilization 0.9 \
--context-tp-size ${VLLM_CONTEXT_TP_SIZE} \
--generate-tp-size ${VLLM_GENERATE_TP_SIZE} \
--log-dir "/tmp/vllm_logs" &
}
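# Dispatch on the mode argument: "context" brings up the head-node services,
# "generate" starts the decode-side worker on the secondary node.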
case "$1" in
context)
if [ "$2" != "--head-url" ] || [ -z "$3" ]; then
echo "Usage: $0 context --head-url <head url>"
exit 1
fi
head_url=$3
export API_SERVER_HOST="$head_url"
start_nats_server "$head_url"
start_api_server "$head_url"
start_context_worker "$head_url"
;;
generate)
if [ "$2" != "--head-url" ] || [ -z "$3" ]; then
echo "Usage: $0 generate --head-url <head url>"
exit 1
fi
head_url=$3
export API_SERVER_HOST="$head_url"
start_generate_worker "$head_url"
;;
*)
echo "Usage: $0 {context|generate} --head-url <head url>"
exit 1
;;
esac
echo "Waiting for deployment to finish startup..."
echo "Once you see all ranks connected to the server, it should be ready..."
echo "Example output:"
echo "\tRank 0 connected to the server"
echo "\t..."
echo "\tRank 1 connected to the server"
sleep 120  # give the workers time to load the model and connect before sending a request
echo "Sending chat completions request..."
curl ${API_SERVER_HOST}:${API_SERVER_PORT}/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama",
"messages": [
{"role": "system", "content": "What is the capital of France?"}
],
"temperature": 0,
"top_p": 0.95,
"max_tokens": 25,
"stream": true,
"n": 1,
"frequency_penalty": 0.0,
"stop": []
}'