docs: restructure vLLM docs and add startup banners to launch scripts (#6698)

Signed-off-by: alec-flowers <aflowers@nvidia.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: ishandhanani <ishandhanani@gmail.com>

docs: restructure vLLM docs and add startup banners to launch scripts (#6698)
Signed-off-by: alec-flowers <aflowers@nvidia.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: ishandhanani <ishandhanani@gmail.com>
75bf1e09 · Alec · GitHub · 47ed1227 · 75bf1e09 · 75bf1e09
Unverified commit 75bf1e09: authored Mar 02, 2026 by Alec; committed by GitHub on Mar 03, 2026
19 changed files
--- a/examples/backends/vllm/launch/agg_router_approx.sh
+++ b/examples/backends/vllm/launch/agg_router_approx.sh
@@ -7,6 +7,25 @@ trap 'echo Cleaning up...; kill 0' EXIT
 # Common configuration
 MODEL="Qwen/Qwen3-0.6B"
 BLOCK_SIZE=64
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+echo "=========================================="
+echo "Launching Aggregated + Approximate KV Routing (2 GPUs)"
+echo "=========================================="
+echo "Model:       $MODEL"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL}\","
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
+echo "      \"max_tokens\": 32"
+echo "    }'"
+echo ""
+echo "=========================================="
 # run frontend with KV router (--router-mode kv) in approximate mode (--no-kv-events)
 python -m dynamo.frontend \

--- a/examples/backends/vllm/launch/agg_router_replicas.sh
+++ b/examples/backends/vllm/launch/agg_router_replicas.sh
@@ -10,6 +10,27 @@ export PYTHONHASHSEED=0
 # Common configuration
 MODEL="Qwen/Qwen3-0.6B"
 BLOCK_SIZE=64
+HTTP_PORT_R1="${DYN_HTTP_PORT_R1:-8000}"
+HTTP_PORT_R2="${DYN_HTTP_PORT_R2:-8001}"
+echo "=========================================="
+echo "Launching Aggregated + KV Routing + Replicas (2 GPUs)"
+echo "=========================================="
+echo "Model:       $MODEL"
+echo "Frontend R1: http://localhost:$HTTP_PORT_R1"
+echo "Frontend R2: http://localhost:$HTTP_PORT_R2"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT_R1}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL}\","
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
+echo "      \"max_tokens\": 32"
+echo "    }'"
+echo ""
+echo "=========================================="
 # run two routers (different HTTP + system ports)
 # Note: use --router-reset-states only on one router to avoid wiping shared state twice.

--- a/examples/backends/vllm/launch/agg_spec_decoding.sh
+++ b/examples/backends/vllm/launch/agg_spec_decoding.sh
@@ -4,11 +4,31 @@
 set -e
 trap 'echo Cleaning up...; kill 0' EXIT
+MODEL="meta-llama/Meta-Llama-3.1-8B-Instruct"
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+echo "=========================================="
+echo "Launching Speculative Decoding (1 GPU)"
+echo "=========================================="
+echo "Model:       $MODEL"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL}\","
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
+echo "      \"max_tokens\": 32"
+echo "    }'"
+echo ""
+echo "=========================================="
 # ---------------------------
 # 1. Frontend (Ingress)
 # ---------------------------
-python -m dynamo.frontend --http-port=8000 &
+python -m dynamo.frontend --http-port="$HTTP_PORT" &
 # ---------------------------
@@ -17,7 +37,7 @@ python -m dynamo.frontend --http-port=8000 &
 # This runs the main model with EAGLE as the draft model for speculative decoding
 DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \
 CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm \
-    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --model "$MODEL" \
    --enforce-eager \
    --speculative_config '{
        "model": "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",

--- a/examples/backends/vllm/launch/dep.sh
+++ b/examples/backends/vllm/launch/dep.sh
@@ -4,6 +4,29 @@
 set -e
 trap 'echo Cleaning up...; kill 0' EXIT
+# Common configuration
+MODEL="Qwen/Qwen3-30B-A3B"
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+echo "=========================================="
+echo "Launching Data Parallel / Expert Parallelism (4 GPUs)"
+echo "=========================================="
+echo "Model:       $MODEL"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL}\","
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
+echo "      \"max_tokens\": 32"
+echo "    }'"
+echo ""
+echo "=========================================="
 # run ingress
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
 python -m dynamo.frontend --router-mode kv &
@@ -15,7 +38,7 @@ python -m dynamo.frontend --router-mode kv &
 for i in {0..3}; do
    VLLM_NIXL_SIDE_CHANNEL_PORT=$((20096 + i)) \
    CUDA_VISIBLE_DEVICES=$i python3 -m dynamo.vllm \
-    --model Qwen/Qwen3-30B-A3B \
+    --model "$MODEL" \
    --data-parallel-rank $i \
    --data-parallel-size 4 \
    --enable-expert-parallel \

--- a/examples/backends/vllm/launch/disagg.sh
+++ b/examples/backends/vllm/launch/disagg.sh
@@ -4,18 +4,45 @@
 set -e
 trap 'echo Cleaning up...; kill 0' EXIT
+# Common configuration
+MODEL="Qwen/Qwen3-0.6B"
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+echo "=========================================="
+echo "Launching Disaggregated Serving (2 GPUs)"
+echo "=========================================="
+echo "Model:       $MODEL"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL}\","
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
+echo "      \"max_tokens\": 32"
+echo "    }'"
+echo ""
+echo "=========================================="
 # run ingress
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
 python -m dynamo.frontend &
 # --enforce-eager is added for quick deployment. for production use, need to remove this flag
- DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
+DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
- CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --disaggregation-mode decode --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' &
+CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
+    --model "$MODEL" \
+    --enforce-eager \
+    --disaggregation-mode decode \
+    --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' &
 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
 VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
 CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
-    --model Qwen/Qwen3-0.6B \
+    --model "$MODEL" \
    --enforce-eager \
    --disaggregation-mode prefill \
    --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \

--- a/examples/backends/vllm/launch/disagg_kvbm.sh
+++ b/examples/backends/vllm/launch/disagg_kvbm.sh
@@ -4,13 +4,34 @@
 set -e
 trap 'echo Cleaning up...; kill 0' EXIT
+MODEL="Qwen/Qwen3-0.6B"
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+echo "=========================================="
+echo "Launching Disaggregated Serving + KVBM (2 GPUs)"
+echo "=========================================="
+echo "Model:       $MODEL"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL}\","
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
+echo "      \"max_tokens\": 32"
+echo "    }'"
+echo ""
+echo "=========================================="
 # run ingress
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
 python -m dynamo.frontend &
 # run decode worker on GPU 0, without enabling KVBM
 # NOTE: remove --enforce-eager for production use
-CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' --enforce-eager &
+CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model "$MODEL" --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' --enforce-eager &
 # run prefill worker on GPU 1 with KVBM enabled using 20GB of CPU cache
 # NOTE: remove --enforce-eager for production use
@@ -18,7 +39,7 @@ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
 DYN_KVBM_CPU_CACHE_GB=20 \
 CUDA_VISIBLE_DEVICES=1 \
  python3 -m dynamo.vllm \
-    --model Qwen/Qwen3-0.6B \
+    --model "$MODEL" \
    --disaggregation-mode prefill \
    --kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
    --enforce-eager \

--- a/examples/backends/vllm/launch/disagg_kvbm_2p2d.sh
+++ b/examples/backends/vllm/launch/disagg_kvbm_2p2d.sh
@@ -4,15 +4,36 @@
 set -e
 trap 'echo Cleaning up...; kill 0' EXIT
+MODEL="Qwen/Qwen3-0.6B"
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+echo "=========================================="
+echo "Launching Disaggregated + KVBM 2P+2D (4 GPUs)"
+echo "=========================================="
+echo "Model:       $MODEL"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL}\","
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
+echo "      \"max_tokens\": 32"
+echo "    }'"
+echo ""
+echo "=========================================="
 # run ingress with KV router
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
 python -m dynamo.frontend --router-mode kv &
 # run decode workers on GPU 0 and 1, without enabling KVBM
 # NOTE: remove --enforce-eager for production use
-CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' --enforce-eager --disaggregation-mode decode &
+CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model "$MODEL" --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' --enforce-eager --disaggregation-mode decode &
 VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
-CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' --enforce-eager --disaggregation-mode decode &
+CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm --model "$MODEL" --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' --enforce-eager --disaggregation-mode decode &
 # run prefill workers on GPU 2 and 3 with KVBM enabled using 20GB of CPU cache
 # NOTE: use different barrier id prefixes for each prefill worker to avoid conflicts
@@ -21,7 +42,7 @@ VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
 DYN_KVBM_CPU_CACHE_GB=20 \
 CUDA_VISIBLE_DEVICES=2 \
  python3 -m dynamo.vllm \
-    --model Qwen/Qwen3-0.6B \
+    --model "$MODEL" \
    --disaggregation-mode prefill \
    --kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
    --enforce-eager \
@@ -33,7 +54,7 @@ DYN_KVBM_LEADER_ZMQ_ACK_PORT=56004 \
 DYN_KVBM_CPU_CACHE_GB=20 \
 CUDA_VISIBLE_DEVICES=3 \
  python3 -m dynamo.vllm \
-    --model Qwen/Qwen3-0.6B \
+    --model "$MODEL" \
    --disaggregation-mode prefill \
    --kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
    --enforce-eager \

--- a/examples/backends/vllm/launch/disagg_kvbm_router.sh
+++ b/examples/backends/vllm/launch/disagg_kvbm_router.sh
@@ -10,6 +10,26 @@ export PYTHONHASHSEED=0
 # Common configuration
 MODEL="Qwen/Qwen3-0.6B"
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+echo "=========================================="
+echo "Launching Disaggregated + KVBM + KV Routing (4 GPUs)"
+echo "=========================================="
+echo "Model:       $MODEL"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL}\","
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
+echo "      \"max_tokens\": 32"
+echo "    }'"
+echo ""
+echo "=========================================="
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
 python -m dynamo.frontend \
    --router-mode kv \

--- a/examples/backends/vllm/launch/disagg_lmcache.sh
+++ b/examples/backends/vllm/launch/disagg_lmcache.sh
@@ -4,12 +4,33 @@
 set -e
 trap 'echo Cleaning up...; kill 0' EXIT
+MODEL="Qwen/Qwen3-0.6B"
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+echo "=========================================="
+echo "Launching Disaggregated Serving + LMCache (2 GPUs)"
+echo "=========================================="
+echo "Model:       $MODEL"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL}\","
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
+echo "      \"max_tokens\": 32"
+echo "    }'"
+echo ""
+echo "=========================================="
 # run ingress with KV router
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
 python -m dynamo.frontend --router-mode kv &
 # run decode worker on GPU 0, without enabling LMCache
-CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B &
+CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model "$MODEL" &
 # wait for decode worker to initialize
 sleep 20
@@ -18,7 +39,7 @@ sleep 20
 VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
 CUDA_VISIBLE_DEVICES=1 \
  python3 -m dynamo.vllm \
-    --model Qwen/Qwen3-0.6B \
+    --model "$MODEL" \
    --disaggregation-mode prefill \
    --kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
--- a/examples/backends/vllm/launch/disagg_multimodal_e_pd.sh
+++ b/examples/backends/vllm/launch/disagg_multimodal_e_pd.sh
@@ -57,11 +57,36 @@ done
 PD_MAX_MODEL_LEN="16384"
-echo "=================================================="
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
-echo "Disaggregated Multimodal Serving (E + PD)"
+echo "=========================================="
-echo "=================================================="
+if [[ "$SINGLE_GPU" == "true" ]]; then
-echo "Model: $MODEL_NAME"
+    GPU_LABEL="1 GPU"
-echo "=================================================="
+else
+    GPU_LABEL="2 GPUs"
+fi
+echo "Launching Disaggregated Multimodal E+PD ($GPU_LABEL)"
+echo "=========================================="
+echo "Model:       $MODEL_NAME"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL_NAME}\","
+echo "      \"messages\": [{"
+echo "        \"role\": \"user\","
+echo "        \"content\": ["
+echo "          {\"type\": \"text\", \"text\": \"Describe the image.\"},"
+echo "          {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/480px-Cat03.jpg\"}}"
+echo "        ]"
+echo "      }],"
+echo "      \"max_tokens\": 50"
+echo "    }'"
+echo ""
+echo "=========================================="
 # Start frontend (no router mode)

--- a/examples/backends/vllm/launch/disagg_multimodal_epd.sh
+++ b/examples/backends/vllm/launch/disagg_multimodal_epd.sh
@@ -56,11 +56,36 @@ while [[ $# -gt 0 ]]; do
 done
-echo "=================================================="
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
-echo "Disaggregated Multimodal Serving (E + P + D)"
+echo "=========================================="
-echo "=================================================="
+if [[ "$SINGLE_GPU" == "true" ]]; then
-echo "Model: $MODEL_NAME"
+    GPU_LABEL="1 GPU"
-echo "=================================================="
+else
+    GPU_LABEL="3 GPUs"
+fi
+echo "Launching Disaggregated Multimodal E/P/D ($GPU_LABEL)"
+echo "=========================================="
+echo "Model:       $MODEL_NAME"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL_NAME}\","
+echo "      \"messages\": [{"
+echo "        \"role\": \"user\","
+echo "        \"content\": ["
+echo "          {\"type\": \"text\", \"text\": \"Describe the image.\"},"
+echo "          {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/480px-Cat03.jpg\"}}"
+echo "        ]"
+echo "      }],"
+echo "      \"max_tokens\": 50"
+echo "    }'"
+echo ""
+echo "=========================================="
 # Start frontend (no router mode)

--- a/examples/backends/vllm/launch/disagg_multimodal_llama.sh
+++ b/examples/backends/vllm/launch/disagg_multimodal_llama.sh
@@ -47,6 +47,36 @@ done
 trap 'echo Cleaning up...; kill 0' EXIT
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+echo "=========================================="
+echo "Launching Disaggregated Multimodal Llama 4 (Multi-Node)"
+echo "=========================================="
+echo "Model:       $MODEL_NAME"
+if [[ $HEAD_NODE -eq 1 ]]; then
+echo "Frontend:    http://localhost:$HTTP_PORT"
+fi
+echo "=========================================="
+if [[ $HEAD_NODE -eq 1 ]]; then
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL_NAME}\","
+echo "      \"messages\": [{"
+echo "        \"role\": \"user\","
+echo "        \"content\": ["
+echo "          {\"type\": \"text\", \"text\": \"Describe the image.\"},"
+echo "          {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/480px-Cat03.jpg\"}}"
+echo "        ]"
+echo "      }],"
+echo "      \"max_tokens\": 50"
+echo "    }'"
+echo ""
+fi
+echo "=========================================="
 # Use TCP transport to avoid NATS payload limits for multimodal
 export DYN_REQUEST_PLANE=tcp

--- a/examples/backends/vllm/launch/disagg_router.sh
+++ b/examples/backends/vllm/launch/disagg_router.sh
@@ -11,6 +11,26 @@ export PYTHONHASHSEED=0
 MODEL="Qwen/Qwen3-0.6B"
 BLOCK_SIZE=64
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+echo "=========================================="
+echo "Launching Disaggregated + KV Routing (4 GPUs)"
+echo "=========================================="
+echo "Model:       $MODEL"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL}\","
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
+echo "      \"max_tokens\": 32"
+echo "    }'"
+echo ""
+echo "=========================================="
 # Start frontend with KV routing
 # The frontend will automatically detect prefill workers and activate an internal prefill router
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)

--- a/examples/backends/vllm/launch/disagg_router_gaudi.sh
+++ b/examples/backends/vllm/launch/disagg_router_gaudi.sh
@@ -16,13 +16,32 @@ PT_HPU_LAZY_MODE=0
 NIXL_BUFFER_DEVICE=cpu
 VLLM_NIXL_BACKEND=UCX
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+echo "=========================================="
+echo "Launching Disaggregated + KV Routing on Gaudi (4 HPUs)"
+echo "=========================================="
+echo "Model:       $MODEL"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL}\","
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
+echo "      \"max_tokens\": 32"
+echo "    }'"
+echo ""
+echo "=========================================="
 # Start frontend with KV routing
 # The frontend will automatically detect prefill workers and activate an internal prefill router
 # edit --router-mode to random / round-robin / kv
 python -m dynamo.frontend \
    --router-mode kv \
-    --http-port 8000 \
+    --http-port "$HTTP_PORT" \
    --router-reset-states &
 # two decode workers

--- a/examples/backends/vllm/launch/disagg_same_gpu.sh
+++ b/examples/backends/vllm/launch/disagg_same_gpu.sh
@@ -32,6 +32,8 @@ else
  exit 1
 fi
+MODEL="Qwen/Qwen3-0.6B"
 # Setup cleanup trap
 cleanup() {
    echo "Cleaning up background processes..."
@@ -41,6 +43,26 @@ cleanup() {
 }
 trap cleanup EXIT INT TERM
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+echo "=========================================="
+echo "Launching Disaggregated on Same GPU (1 GPU)"
+echo "=========================================="
+echo "Model:       $MODEL"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL}\","
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
+echo "      \"max_tokens\": 32"
+echo "    }'"
+echo ""
+echo "=========================================="
 # run ingress
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
 python3 -m dynamo.frontend &
@@ -53,7 +75,7 @@ DYNAMO_PID=$!
 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
 CUDA_VISIBLE_DEVICES=0 \
 python3 -m dynamo.vllm \
-  --model Qwen/Qwen3-0.6B \
+  --model "$MODEL" \
  --enforce-eager \
  --disaggregation-mode decode \
  --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
@@ -75,7 +97,7 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
 VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
 CUDA_VISIBLE_DEVICES=0 \
 python3 -m dynamo.vllm \
-  --model Qwen/Qwen3-0.6B \
+  --model "$MODEL" \
  --enforce-eager \
  --disaggregation-mode prefill \
  --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \

--- a/examples/backends/vllm/launch/dsr1_dep.sh
+++ b/examples/backends/vllm/launch/dsr1_dep.sh
@@ -70,14 +70,35 @@ fi
 # Calculate data parallel size
 DATA_PARALLEL_SIZE=$((NUM_NODES * GPUS_PER_NODE))
-echo "Configuration:"
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
-echo "  Number of nodes: $NUM_NODES"
+echo "=========================================="
-echo "  Node rank: $NODE_RANK"
+echo "Launching DeepSeek-R1 Data Parallel (Multi-Node)"
-echo "  GPUs per node: $GPUS_PER_NODE"
+echo "=========================================="
-echo "  Data parallel size: $DATA_PARALLEL_SIZE"
+echo "Model:       $MODEL"
-echo "  Master address: $MASTER_ADDR"
+if [ "$NODE_RANK" -eq 0 ]; then
-echo "  Log directory: $LOG_DIR"
+echo "Frontend:    http://localhost:$HTTP_PORT"
-echo "  Model name: $MODEL"
+fi
+echo "Number of nodes: $NUM_NODES"
+echo "Node rank:       $NODE_RANK"
+echo "GPUs per node:   $GPUS_PER_NODE"
+echo "Data parallel:   $DATA_PARALLEL_SIZE"
+echo "Master address:  $MASTER_ADDR"
+echo "Log directory:   $LOG_DIR"
+echo "=========================================="
+if [ "$NODE_RANK" -eq 0 ]; then
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL}\","
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
+echo "      \"max_tokens\": 32"
+echo "    }'"
+echo ""
+echo "=========================================="
+fi
 trap 'echo Cleaning up...; kill 0' EXIT

--- a/examples/backends/vllm/launch/lora/agg_lora.sh
+++ b/examples/backends/vllm/launch/lora/agg_lora.sh
@@ -16,47 +16,53 @@ export DYN_LORA_PATH=/tmp/dynamo_loras_minio
 mkdir -p $DYN_LORA_PATH
+MODEL="Qwen/Qwen3-0.6B"
+SYSTEM_PORT="${DYN_SYSTEM_PORT1:-8081}"
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+echo "=========================================="
+echo "Launching Aggregated Serving + LoRA (1 GPU)"
+echo "=========================================="
+echo "Model:       $MODEL"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Once running, test with:"
+echo ""
+echo "  # Check available models"
+echo "  curl http://localhost:${HTTP_PORT}/v1/models | jq ."
+echo ""
+echo "  # Load LoRA (using S3 URI)"
+echo "  curl -s -X POST http://localhost:${SYSTEM_PORT}/v1/loras \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{\"lora_name\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\","
+echo "         \"source\": {\"uri\": \"s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora\"}}' | jq ."
+echo ""
+echo "  # Test LoRA inference"
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{\"model\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\","
+echo "         \"messages\": [{\"role\": \"user\", \"content\": \"What is deep learning?\"}],"
+echo "         \"max_tokens\": 300, \"temperature\": 0.0}' | jq ."
+echo ""
+echo "  # Test base model inference (for comparison)"
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{\"model\": \"${MODEL}\","
+echo "         \"messages\": [{\"role\": \"user\", \"content\": \"What is deep learning?\"}],"
+echo "         \"max_tokens\": 300, \"temperature\": 0.0}' | jq ."
+echo ""
+echo "  # Unload LoRA"
+echo "  curl -X DELETE http://localhost:${SYSTEM_PORT}/v1/loras/codelion/Qwen3-0.6B-accuracy-recovery-lora"
+echo ""
+echo "=========================================="
 # run ingress
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var.
 python -m dynamo.frontend &
 # run worker
 # --enforce-eager is added for quick deployment. for production use, need to remove this flag
-DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
+DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT} \
-    python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager  \
+    python -m dynamo.vllm --model "$MODEL" --enforce-eager \
-    --enable-lora  \
+    --enable-lora \
    --max-lora-rank 64
-################################## Example Usage ##################################
-# Check available models
-curl http://localhost:8000/v1/models | jq .
-# Load LoRA using s3 uri
-curl -s  -X POST http://localhost:8081/v1/loras \
-       -H "Content-Type: application/json" \
-       -d '{"lora_name": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
-     "source": {"uri": "s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora"}}' | jq .
-# Test LoRA inference
-curl -X POST http://localhost:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
-    "messages": [{"role": "user", "content": "What is deep learning?"}],
-    "max_tokens": 300,
-    "temperature": 0.0
-  }'
-# Test base model inference (for comparison)
-curl -X POST http://localhost:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "Qwen/Qwen3-0.6B",
-    "messages": [{"role": "user", "content": "What is deep learning?"}],
-    "max_tokens": 300,
-    "temperature": 0.0
-  }'
-# Unload LoRA
-curl -X DELETE http://localhost:8081/v1/loras/codelion/Qwen3-0.6B-accuracy-recovery-lora
--- a/examples/backends/vllm/launch/lora/agg_lora_router.sh
+++ b/examples/backends/vllm/launch/lora/agg_lora_router.sh
@@ -23,6 +23,41 @@ export PYTHONHASHSEED=0
 MODEL="Qwen/Qwen3-0.6B"
 BLOCK_SIZE=64
+SYSTEM_PORT1="${DYN_SYSTEM_PORT1:-8081}"
+SYSTEM_PORT2="${DYN_SYSTEM_PORT2:-8082}"
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+printf '%s\n' "==========================================" \
+    "Launching Aggregated + LoRA + KV Routing (2 GPUs)" \
+    "==========================================" \
+    "Model:       $MODEL" \
+    "Frontend:    http://localhost:$HTTP_PORT" \
+    "==========================================" \
+    "" \
+    "Once running, test with:" \
+    "" \
+    "  # Check available models" \
+    "  curl http://localhost:${HTTP_PORT}/v1/models | jq ." \
+    "" \
+    "  # Load LoRA to both instances (using S3 URI)" \
+    "  curl -s -X POST http://localhost:${SYSTEM_PORT1}/v1/loras \\" \
+    "    -H 'Content-Type: application/json' \\" \
+    "    -d '{\"lora_name\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\"," \
+    "         \"source\": {\"uri\": \"s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora\"}}' | jq ." \
+    "" \
+    "  curl -s -X POST http://localhost:${SYSTEM_PORT2}/v1/loras \\" \
+    "    -H 'Content-Type: application/json' \\" \
+    "    -d '{\"lora_name\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\"," \
+    "         \"source\": {\"uri\": \"s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora\"}}' | jq ." \
+    "" \
+    "  # Test LoRA inference" \
+    "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\" \
+    "    -H 'Content-Type: application/json' \\" \
+    "    -d '{\"model\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\"," \
+    "         \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}]," \
+    "         \"max_tokens\": 32}' | jq ." \
+    "" \
+    "=========================================="
 # run frontend + KV router
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
 python -m dynamo.frontend \
@@ -31,7 +66,7 @@ python -m dynamo.frontend \
 # run workers
 # --enforce-eager is added for quick deployment. for production use, need to remove this flag
-DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
+DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT1} \
 CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
    --model $MODEL \
    --block-size $BLOCK_SIZE \
@@ -40,7 +75,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
    --max-lora-rank 64 \
    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' &
-DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
+DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT2} \
 VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
 CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
    --model $MODEL \
@@ -50,42 +85,9 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
    --max-lora-rank 64 \
    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
-# below commands are not executed automatically in the script because previous backend launch command is blocking.
+# Sample output after running the LoRA inference curl request twice (wrapped in a no-op heredoc below so the shell never executes it).
-################################## Example Usage ##################################
-# Check available models
-curl http://localhost:8000/v1/models | jq .
-# Load LoRA to instances using s3 uri
-curl -s  -X POST http://localhost:8081/v1/loras \
-       -H "Content-Type: application/json" \
-       -d '{"lora_name": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
-     "source": {"uri": "s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora"}}' | jq .
-curl -s  -X POST http://localhost:8082/v1/loras \
-       -H "Content-Type: application/json" \
-       -d '{"lora_name": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
-     "source": {"uri": "s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora"}}' | jq .
- # Test LoRA inference
-curl localhost:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
-    "messages": [
-    {
-        "role": "user",
-        "content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden."
-    }
-    ],
-    "stream": false,
-    "max_tokens": 30
-  }' | jq .
- # Sample output after running above curl request twice.
 # usage.prompt_tokens_details.cached_tokens is the number of tokens that were cached from the previous request.
+ : <<'SAMPLE_OUTPUT'
 {
  "id": "chatcmpl-0cf880c2-fe98-45c4-9c76-84c3ad1a56cc",
  "choices": [
@@ -118,3 +120,4 @@ curl localhost:8000/v1/chat/completions \
    }
  }
 }
+SAMPLE_OUTPUT
--- a/examples/backends/vllm/launch/vllm_serve_embedding_cache.sh
+++ b/examples/backends/vllm/launch/vllm_serve_embedding_cache.sh
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
+MODEL="Qwen/Qwen3-VL-30B-A3B-Instruct-FP8"
 CAPACITY_GB=10
 EXTRA_ARGS=()
@@ -25,8 +26,35 @@ if [[ "$CAPACITY_GB" != "0" ]]; then
    }")
 fi
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+printf '%s\n' "==========================================" \
+    "Launching vLLM Serve + Embedding Cache (1 GPU)" \
+    "==========================================" \
+    "Model:       $MODEL" \
+    "Server:      http://localhost:$HTTP_PORT" \
+    "==========================================" \
+    "" \
+    "Example test command:" \
+    "" \
+    "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\" \
+    "    -H 'Content-Type: application/json' \\" \
+    "    -d '{" \
+    "      \"model\": \"${MODEL}\"," \
+    "      \"messages\": [{" \
+    "        \"role\": \"user\"," \
+    "        \"content\": [" \
+    "          {\"type\": \"text\", \"text\": \"Describe the image.\"}," \
+    "          {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/480px-Cat03.jpg\"}}" \
+    "        ]" \
+    "      }]," \
+    "      \"max_tokens\": 50" \
+    "    }'" \
+    "" \
+    "=========================================="
 CUDA_VISIBLE_DEVICES=2 \
-vllm serve Qwen/Qwen3-VL-30B-A3B-Instruct-FP8 \
+vllm serve "$MODEL" \
+    --port "$HTTP_PORT" \
    --enable-log-requests \
    --max-model-len 16384 \
    --gpu-memory-utilization .9 \