docs: restructure vLLM docs and add startup banners to launch scripts (#6698)

Signed-off-by: alec-flowers <aflowers@nvidia.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: ishandhanani <ishandhanani@gmail.com>

docs: restructure vLLM docs and add startup banners to launch scripts (#6698)
Signed-off-by: alec-flowers <aflowers@nvidia.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: ishandhanani <ishandhanani@gmail.com>
75bf1e09 · Alec · GitHub · 47ed1227 · 75bf1e09 · 75bf1e09
Unverified Commit 75bf1e09 authored Mar 02, 2026 by Alec Committed by GitHub Mar 03, 2026
20 changed files
--- a/examples/backends/sglang/launch/agg_vision.sh
+++ b/examples/backends/sglang/launch/agg_vision.sh
@@ -78,7 +78,7 @@ echo "    -H 'Content-Type: application/json' \\"
 echo "    -d '{"
 echo "      \"model\": \"${MODEL}\","
 echo "      \"messages\": [{\"role\": \"user\", \"content\": ["
-echo "        {\"type\": \"text\", \"text\": \"Describe the image.\"},"
+echo "        {\"type\": \"text\", \"text\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"},"
 echo "        {\"type\": \"image_url\", \"image_url\": {\"url\": \"http://images.cocodataset.org/test2017/000000155781.jpg\"}}"
 echo "      ]}],"
 echo "      \"max_tokens\": 50"

--- a/examples/backends/sglang/launch/diffusion_llada.sh
+++ b/examples/backends/sglang/launch/diffusion_llada.sh
@@ -48,7 +48,7 @@ echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
 echo "    -H 'Content-Type: application/json' \\"
 echo "    -d '{"
 echo "      \"model\": \"${MODEL_PATH}\","
-echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Hello! How are you?\"}],"
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
 echo "      \"temperature\": 0.7,"
 echo "      \"max_tokens\": 512"
 echo "    }'"

--- a/examples/backends/sglang/launch/disagg.sh
+++ b/examples/backends/sglang/launch/disagg.sh
@@ -63,7 +63,7 @@ echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
 echo "    -H 'Content-Type: application/json' \\"
 echo "    -d '{"
 echo "      \"model\": \"${MODEL}\","
-echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}],"
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
 echo "      \"max_tokens\": 32"
 echo "    }'"
 echo ""

--- a/examples/backends/sglang/launch/disagg_router.sh
+++ b/examples/backends/sglang/launch/disagg_router.sh
@@ -64,7 +64,7 @@ echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
 echo "    -H 'Content-Type: application/json' \\"
 echo "    -d '{"
 echo "      \"model\": \"${MODEL}\","
-echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}],"
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
 echo "      \"max_tokens\": 32"
 echo "    }'"
 echo ""

--- a/examples/backends/sglang/launch/disagg_same_gpu.sh
+++ b/examples/backends/sglang/launch/disagg_same_gpu.sh
@@ -55,7 +55,7 @@ echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
 echo "    -H 'Content-Type: application/json' \\"
 echo "    -d '{"
 echo "      \"model\": \"${MODEL}\","
-echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}],"
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
 echo "      \"max_tokens\": 32"
 echo "    }'"
 echo ""

--- a/examples/backends/sglang/launch/image_diffusion.sh
+++ b/examples/backends/sglang/launch/image_diffusion.sh
@@ -85,7 +85,7 @@ echo ""
 echo "  curl http://localhost:${HTTP_PORT}/v1/images/generations \\"
 echo "    -H 'Content-Type: application/json' \\"
 echo "    -d '{"
-echo "      \"prompt\": \"A curious raccoon exploring a garden\","
+echo "      \"prompt\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\","
 echo "      \"model\": \"${MODEL_PATH}\","
 echo "      \"size\": \"1024x1024\","
 echo "      \"response_format\": \"url\","

--- a/examples/backends/sglang/launch/multimodal_disagg.sh
+++ b/examples/backends/sglang/launch/multimodal_disagg.sh
@@ -77,7 +77,7 @@ echo "    -H 'Content-Type: application/json' \\"
 echo "    -d '{"
 echo "      \"model\": \"${MODEL_NAME}\","
 echo "      \"messages\": [{\"role\": \"user\", \"content\": ["
-echo "        {\"type\": \"text\", \"text\": \"Describe the image.\"},"
+echo "        {\"type\": \"text\", \"text\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"},"
 echo "        {\"type\": \"image_url\", \"image_url\": {\"url\": \"http://images.cocodataset.org/test2017/000000155781.jpg\"}}"
 echo "      ]}],"
 echo "      \"max_tokens\": 50"

--- a/examples/backends/sglang/launch/multimodal_epd.sh
+++ b/examples/backends/sglang/launch/multimodal_epd.sh
@@ -77,7 +77,7 @@ echo "    -H 'Content-Type: application/json' \\"
 echo "    -d '{"
 echo "      \"model\": \"${MODEL_NAME}\","
 echo "      \"messages\": [{\"role\": \"user\", \"content\": ["
-echo "        {\"type\": \"text\", \"text\": \"Describe the image.\"},"
+echo "        {\"type\": \"text\", \"text\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"},"
 echo "        {\"type\": \"image_url\", \"image_url\": {\"url\": \"http://images.cocodataset.org/test2017/000000155781.jpg\"}}"
 echo "      ]}],"
 echo "      \"max_tokens\": 50"

--- a/examples/backends/sglang/launch/text-to-video-diffusion.sh
+++ b/examples/backends/sglang/launch/text-to-video-diffusion.sh
@@ -113,7 +113,7 @@ echo ""
 echo "  curl http://localhost:${HTTP_PORT}/v1/videos \\"
 echo "    -H 'Content-Type: application/json' \\"
 echo "    -d '{"
-echo "      \"prompt\": \"A curious raccoon exploring a garden\","
+echo "      \"prompt\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\","
 echo "      \"model\": \"${MODEL_PATH}\","
 echo "      \"seconds\": 2,"
 echo "      \"size\": \"${WIDTH}x${HEIGHT}\","

--- a/examples/backends/vllm/launch/agg.sh
+++ b/examples/backends/vllm/launch/agg.sh
@@ -22,6 +22,26 @@ while [[ $# -gt 0 ]]; do
    esac
 done
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+echo "=========================================="
+echo "Launching Aggregated Serving (1 GPU)"
+echo "=========================================="
+echo "Model:       $MODEL"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL}\","
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
+echo "      \"max_tokens\": 32"
+echo "    }'"
+echo ""
+echo "=========================================="
 # run ingress
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
 python -m dynamo.frontend &

--- a/examples/backends/vllm/launch/agg_kvbm.sh
+++ b/examples/backends/vllm/launch/agg_kvbm.sh
@@ -4,6 +4,27 @@
 set -e
 trap 'echo Cleaning up...; kill 0' EXIT
+MODEL="Qwen/Qwen3-0.6B"
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+echo "=========================================="
+echo "Launching Aggregated Serving + KVBM (1 GPU)"
+echo "=========================================="
+echo "Model:       $MODEL"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL}\","
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
+echo "      \"max_tokens\": 32"
+echo "    }'"
+echo ""
+echo "=========================================="
 # run ingress
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
 python -m dynamo.frontend &
@@ -11,4 +32,4 @@ python -m dynamo.frontend &
 # run worker with KVBM enabled
 # NOTE: remove --enforce-eager for production use
 DYN_KVBM_CPU_CACHE_GB=20 \
-  python -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"}' --enforce-eager
+  python -m dynamo.vllm --model "$MODEL" --kv-transfer-config '{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"}' --enforce-eager
--- a/examples/backends/vllm/launch/agg_kvbm_router.sh
+++ b/examples/backends/vllm/launch/agg_kvbm_router.sh
@@ -9,6 +9,25 @@ export PYTHONHASHSEED=0
 # Common configuration
 MODEL="Qwen/Qwen3-0.6B"
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+echo "=========================================="
+echo "Launching Aggregated + KVBM + KV Routing (2 GPUs)"
+echo "=========================================="
+echo "Model:       $MODEL"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL}\","
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
+echo "      \"max_tokens\": 32"
+echo "    }'"
+echo ""
+echo "=========================================="
 # run frontend + KV router
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)

--- a/examples/backends/vllm/launch/agg_lmcache.sh
+++ b/examples/backends/vllm/launch/agg_lmcache.sh
@@ -7,10 +7,31 @@ trap 'echo Cleaning up...; kill 0' EXIT
 # Explicitly unset PROMETHEUS_MULTIPROC_DIR to let LMCache or Dynamo manage it internally
 unset PROMETHEUS_MULTIPROC_DIR
+MODEL="Qwen/Qwen3-0.6B"
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+echo "=========================================="
+echo "Launching Aggregated Serving + LMCache (1 GPU)"
+echo "=========================================="
+echo "Model:       $MODEL"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL}\","
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
+echo "      \"max_tokens\": 32"
+echo "    }'"
+echo ""
+echo "=========================================="
 # run ingress
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
 python -m dynamo.frontend &
 # run worker with LMCache enabled (without PROMETHEUS_MULTIPROC_DIR set externally)
 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
-  python -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
+  python -m dynamo.vllm --model "$MODEL" --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
--- a/examples/backends/vllm/launch/agg_lmcache_multiproc.sh
+++ b/examples/backends/vllm/launch/agg_lmcache_multiproc.sh
@@ -17,6 +17,27 @@ cleanup() {
 }
 trap cleanup EXIT
+MODEL="Qwen/Qwen3-0.6B"
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+echo "=========================================="
+echo "Launching Aggregated + LMCache + Multiproc (1 GPU)"
+echo "=========================================="
+echo "Model:       $MODEL"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL}\","
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
+echo "      \"max_tokens\": 32"
+echo "    }'"
+echo ""
+echo "=========================================="
 # run ingress
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
 python -m dynamo.frontend &
@@ -24,5 +45,5 @@ python -m dynamo.frontend &
 # run worker with LMCache enabled and PROMETHEUS_MULTIPROC_DIR explicitly set
 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
  PROMETHEUS_MULTIPROC_DIR="$PROMETHEUS_MULTIPROC_DIR" \
-  python -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
+  python -m dynamo.vllm --model "$MODEL" --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
--- a/examples/backends/vllm/launch/agg_multimodal.sh
+++ b/examples/backends/vllm/launch/agg_multimodal.sh
@@ -43,6 +43,32 @@ while [[ $# -gt 0 ]]; do
    esac
 done
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+echo "=========================================="
+echo "Launching Aggregated Multimodal Serving"
+echo "=========================================="
+echo "Model:       $MODEL_NAME"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL_NAME}\","
+echo "      \"messages\": [{"
+echo "        \"role\": \"user\","
+echo "        \"content\": ["
+echo "          {\"type\": \"text\", \"text\": \"Describe the image.\"},"
+echo "          {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/480px-Cat03.jpg\"}}"
+echo "        ]"
+echo "      }],"
+echo "      \"max_tokens\": 50"
+echo "    }'"
+echo ""
+echo "=========================================="
 # Use TCP transport (instead of default NATS)
 # TCP is preferred for multimodal workloads because it overcomes:
 # - NATS default 1MB max payload limit (multimodal base64 images can exceed this)

--- a/examples/backends/vllm/launch/agg_omni.sh
+++ b/examples/backends/vllm/launch/agg_omni.sh
@@ -29,15 +29,28 @@ while [[ $# -gt 0 ]]; do
    esac
 done
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+echo "=========================================="
+echo "Launching vLLM-Omni Text-to-Text (1 GPU)"
 echo "=========================================="
-echo "Starting vLLM-Omni Worker (Text-to-Text)"
 echo "Model:       $MODEL"
-echo "Stage Config: $STAGE_CONFIG"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL}\","
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
+echo "      \"max_tokens\": 32"
+echo "    }'"
+echo ""
 echo "=========================================="
 # Run ingress (frontend)
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
-echo "Starting frontend on port ${DYN_HTTP_PORT:-8000}..."
 python -m dynamo.frontend &
 FRONTEND_PID=$!

--- a/examples/backends/vllm/launch/agg_omni_image.sh
+++ b/examples/backends/vllm/launch/agg_omni_image.sh
@@ -22,12 +22,26 @@ while [[ $# -gt 0 ]]; do
    esac
 done
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+echo "=========================================="
+echo "Launching vLLM-Omni Image Generation (1 GPU)"
 echo "=========================================="
-echo "Starting vLLM-Omni Worker"
 echo "Model:       $MODEL"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL}\","
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Generate an image of a sunset over mountains.\"}],"
+echo "      \"max_tokens\": 32"
+echo "    }'"
+echo ""
 echo "=========================================="
-echo "Starting frontend on port ${DYN_HTTP_PORT:-8000}..."
 python -m dynamo.frontend &
 FRONTEND_PID=$!

--- a/examples/backends/vllm/launch/agg_omni_video.sh
+++ b/examples/backends/vllm/launch/agg_omni_video.sh
@@ -23,13 +23,26 @@ while [[ $# -gt 0 ]]; do
    esac
 done
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+echo "=========================================="
+echo "Launching vLLM-Omni Video Generation (1 GPU)"
 echo "=========================================="
-echo "Starting vLLM-Omni Worker"
 echo "Model:       $MODEL"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL}\","
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Generate a short video of ocean waves.\"}],"
+echo "      \"max_tokens\": 32"
+echo "    }'"
+echo ""
 echo "=========================================="
-echo "Starting frontend on port ${DYN_HTTP_PORT:-8000}..."
 python -m dynamo.frontend &
 FRONTEND_PID=$!

--- a/examples/backends/vllm/launch/agg_request_planes.sh
+++ b/examples/backends/vllm/launch/agg_request_planes.sh
@@ -36,14 +36,36 @@ while [[ $# -gt 0 ]]; do
    esac
 done
+MODEL="Qwen/Qwen3-0.6B"
 # Set the request plane mode
 export DYN_REQUEST_PLANE=$REQUEST_PLANE
 echo "Using request plane mode: $REQUEST_PLANE"
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+echo "=========================================="
+echo "Launching Aggregated Serving + Request Planes (1 GPU)"
+echo "=========================================="
+echo "Model:       $MODEL"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL}\","
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
+echo "      \"max_tokens\": 32"
+echo "    }'"
+echo ""
+echo "=========================================="
 # Frontend
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
 python -m dynamo.frontend &
 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
 DYN_HEALTH_CHECK_ENABLED=true \
-    python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager
+    python -m dynamo.vllm --model "$MODEL" --enforce-eager
--- a/examples/backends/vllm/launch/agg_router.sh
+++ b/examples/backends/vllm/launch/agg_router.sh
@@ -11,6 +11,26 @@ export PYTHONHASHSEED=0
 MODEL="Qwen/Qwen3-0.6B"
 BLOCK_SIZE=64
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+echo "=========================================="
+echo "Launching Aggregated + KV Routing (2 GPUs)"
+echo "=========================================="
+echo "Model:       $MODEL"
+echo "Frontend:    http://localhost:$HTTP_PORT"
+echo "=========================================="
+echo ""
+echo "Example test command:"
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{"
+echo "      \"model\": \"${MODEL}\","
+echo "      \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
+echo "      \"max_tokens\": 32"
+echo "    }'"
+echo ""
+echo "=========================================="
 # run frontend + KV router
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
 python -m dynamo.frontend \