Unverified Commit 75bf1e09 authored by Alec, committed by GitHub
Browse files

docs: restructure vLLM docs and add startup banners to launch scripts (#6698)


Signed-off-by: alec-flowers <aflowers@nvidia.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: ishandhanani <ishandhanani@gmail.com>
parent 47ed1227
...@@ -78,7 +78,7 @@ echo " -H 'Content-Type: application/json' \\" ...@@ -78,7 +78,7 @@ echo " -H 'Content-Type: application/json' \\"
echo " -d '{" echo " -d '{"
echo " \"model\": \"${MODEL}\"," echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": [" echo " \"messages\": [{\"role\": \"user\", \"content\": ["
echo " {\"type\": \"text\", \"text\": \"Describe the image.\"}," echo " {\"type\": \"text\", \"text\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"},"
echo " {\"type\": \"image_url\", \"image_url\": {\"url\": \"http://images.cocodataset.org/test2017/000000155781.jpg\"}}" echo " {\"type\": \"image_url\", \"image_url\": {\"url\": \"http://images.cocodataset.org/test2017/000000155781.jpg\"}}"
echo " ]}]," echo " ]}],"
echo " \"max_tokens\": 50" echo " \"max_tokens\": 50"
......
...@@ -48,7 +48,7 @@ echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\" ...@@ -48,7 +48,7 @@ echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\" echo " -H 'Content-Type: application/json' \\"
echo " -d '{" echo " -d '{"
echo " \"model\": \"${MODEL_PATH}\"," echo " \"model\": \"${MODEL_PATH}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Hello! How are you?\"}]," echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"temperature\": 0.7," echo " \"temperature\": 0.7,"
echo " \"max_tokens\": 512" echo " \"max_tokens\": 512"
echo " }'" echo " }'"
......
...@@ -63,7 +63,7 @@ echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\" ...@@ -63,7 +63,7 @@ echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\" echo " -H 'Content-Type: application/json' \\"
echo " -d '{" echo " -d '{"
echo " \"model\": \"${MODEL}\"," echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}]," echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32" echo " \"max_tokens\": 32"
echo " }'" echo " }'"
echo "" echo ""
......
...@@ -64,7 +64,7 @@ echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\" ...@@ -64,7 +64,7 @@ echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\" echo " -H 'Content-Type: application/json' \\"
echo " -d '{" echo " -d '{"
echo " \"model\": \"${MODEL}\"," echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}]," echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32" echo " \"max_tokens\": 32"
echo " }'" echo " }'"
echo "" echo ""
......
...@@ -55,7 +55,7 @@ echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\" ...@@ -55,7 +55,7 @@ echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\" echo " -H 'Content-Type: application/json' \\"
echo " -d '{" echo " -d '{"
echo " \"model\": \"${MODEL}\"," echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}]," echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32" echo " \"max_tokens\": 32"
echo " }'" echo " }'"
echo "" echo ""
......
...@@ -85,7 +85,7 @@ echo "" ...@@ -85,7 +85,7 @@ echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/images/generations \\" echo " curl http://localhost:${HTTP_PORT}/v1/images/generations \\"
echo " -H 'Content-Type: application/json' \\" echo " -H 'Content-Type: application/json' \\"
echo " -d '{" echo " -d '{"
echo " \"prompt\": \"A curious raccoon exploring a garden\"," echo " \"prompt\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\","
echo " \"model\": \"${MODEL_PATH}\"," echo " \"model\": \"${MODEL_PATH}\","
echo " \"size\": \"1024x1024\"," echo " \"size\": \"1024x1024\","
echo " \"response_format\": \"url\"," echo " \"response_format\": \"url\","
......
...@@ -77,7 +77,7 @@ echo " -H 'Content-Type: application/json' \\" ...@@ -77,7 +77,7 @@ echo " -H 'Content-Type: application/json' \\"
echo " -d '{" echo " -d '{"
echo " \"model\": \"${MODEL_NAME}\"," echo " \"model\": \"${MODEL_NAME}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": [" echo " \"messages\": [{\"role\": \"user\", \"content\": ["
echo " {\"type\": \"text\", \"text\": \"Describe the image.\"}," echo " {\"type\": \"text\", \"text\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"},"
echo " {\"type\": \"image_url\", \"image_url\": {\"url\": \"http://images.cocodataset.org/test2017/000000155781.jpg\"}}" echo " {\"type\": \"image_url\", \"image_url\": {\"url\": \"http://images.cocodataset.org/test2017/000000155781.jpg\"}}"
echo " ]}]," echo " ]}],"
echo " \"max_tokens\": 50" echo " \"max_tokens\": 50"
......
...@@ -77,7 +77,7 @@ echo " -H 'Content-Type: application/json' \\" ...@@ -77,7 +77,7 @@ echo " -H 'Content-Type: application/json' \\"
echo " -d '{" echo " -d '{"
echo " \"model\": \"${MODEL_NAME}\"," echo " \"model\": \"${MODEL_NAME}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": [" echo " \"messages\": [{\"role\": \"user\", \"content\": ["
echo " {\"type\": \"text\", \"text\": \"Describe the image.\"}," echo " {\"type\": \"text\", \"text\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"},"
echo " {\"type\": \"image_url\", \"image_url\": {\"url\": \"http://images.cocodataset.org/test2017/000000155781.jpg\"}}" echo " {\"type\": \"image_url\", \"image_url\": {\"url\": \"http://images.cocodataset.org/test2017/000000155781.jpg\"}}"
echo " ]}]," echo " ]}],"
echo " \"max_tokens\": 50" echo " \"max_tokens\": 50"
......
...@@ -113,7 +113,7 @@ echo "" ...@@ -113,7 +113,7 @@ echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/videos \\" echo " curl http://localhost:${HTTP_PORT}/v1/videos \\"
echo " -H 'Content-Type: application/json' \\" echo " -H 'Content-Type: application/json' \\"
echo " -d '{" echo " -d '{"
echo " \"prompt\": \"A curious raccoon exploring a garden\"," echo " \"prompt\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\","
echo " \"model\": \"${MODEL_PATH}\"," echo " \"model\": \"${MODEL_PATH}\","
echo " \"seconds\": 2," echo " \"seconds\": 2,"
echo " \"size\": \"${WIDTH}x${HEIGHT}\"," echo " \"size\": \"${WIDTH}x${HEIGHT}\","
......
...@@ -22,6 +22,26 @@ while [[ $# -gt 0 ]]; do ...@@ -22,6 +22,26 @@ while [[ $# -gt 0 ]]; do
esac esac
done done
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated Serving (1 GPU)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
# run ingress # run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend & python -m dynamo.frontend &
......
...@@ -4,6 +4,27 @@ ...@@ -4,6 +4,27 @@
set -e set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated Serving + KVBM (1 GPU)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
# run ingress # run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend & python -m dynamo.frontend &
...@@ -11,4 +32,4 @@ python -m dynamo.frontend & ...@@ -11,4 +32,4 @@ python -m dynamo.frontend &
# run worker with KVBM enabled # run worker with KVBM enabled
# NOTE: remove --enforce-eager for production use # NOTE: remove --enforce-eager for production use
DYN_KVBM_CPU_CACHE_GB=20 \ DYN_KVBM_CPU_CACHE_GB=20 \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"}' --enforce-eager python -m dynamo.vllm --model "$MODEL" --kv-transfer-config '{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"}' --enforce-eager
...@@ -9,6 +9,25 @@ export PYTHONHASHSEED=0 ...@@ -9,6 +9,25 @@ export PYTHONHASHSEED=0
# Common configuration # Common configuration
MODEL="Qwen/Qwen3-0.6B" MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated + KVBM + KV Routing (2 GPUs)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
# run frontend + KV router # run frontend + KV router
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......
...@@ -7,10 +7,31 @@ trap 'echo Cleaning up...; kill 0' EXIT ...@@ -7,10 +7,31 @@ trap 'echo Cleaning up...; kill 0' EXIT
# Explicitly unset PROMETHEUS_MULTIPROC_DIR to let LMCache or Dynamo manage it internally # Explicitly unset PROMETHEUS_MULTIPROC_DIR to let LMCache or Dynamo manage it internally
unset PROMETHEUS_MULTIPROC_DIR unset PROMETHEUS_MULTIPROC_DIR
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated Serving + LMCache (1 GPU)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
# run ingress # run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend & python -m dynamo.frontend &
# run worker with LMCache enabled (without PROMETHEUS_MULTIPROC_DIR set externally) # run worker with LMCache enabled (without PROMETHEUS_MULTIPROC_DIR set externally)
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}' python -m dynamo.vllm --model "$MODEL" --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
...@@ -17,6 +17,27 @@ cleanup() { ...@@ -17,6 +17,27 @@ cleanup() {
} }
trap cleanup EXIT trap cleanup EXIT
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated + LMCache + Multiproc (1 GPU)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
# run ingress # run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend & python -m dynamo.frontend &
...@@ -24,5 +45,5 @@ python -m dynamo.frontend & ...@@ -24,5 +45,5 @@ python -m dynamo.frontend &
# run worker with LMCache enabled and PROMETHEUS_MULTIPROC_DIR explicitly set # run worker with LMCache enabled and PROMETHEUS_MULTIPROC_DIR explicitly set
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
PROMETHEUS_MULTIPROC_DIR="$PROMETHEUS_MULTIPROC_DIR" \ PROMETHEUS_MULTIPROC_DIR="$PROMETHEUS_MULTIPROC_DIR" \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}' python -m dynamo.vllm --model "$MODEL" --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
...@@ -43,6 +43,32 @@ while [[ $# -gt 0 ]]; do ...@@ -43,6 +43,32 @@ while [[ $# -gt 0 ]]; do
esac esac
done done
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated Multimodal Serving"
echo "=========================================="
echo "Model: $MODEL_NAME"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL_NAME}\","
echo " \"messages\": [{"
echo " \"role\": \"user\","
echo " \"content\": ["
echo " {\"type\": \"text\", \"text\": \"Describe the image.\"},"
echo " {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/480px-Cat03.jpg\"}}"
echo " ]"
echo " }],"
echo " \"max_tokens\": 50"
echo " }'"
echo ""
echo "=========================================="
# Use TCP transport (instead of default NATS) # Use TCP transport (instead of default NATS)
# TCP is preferred for multimodal workloads because it overcomes: # TCP is preferred for multimodal workloads because it overcomes:
# - NATS default 1MB max payload limit (multimodal base64 images can exceed this) # - NATS default 1MB max payload limit (multimodal base64 images can exceed this)
......
...@@ -29,15 +29,28 @@ while [[ $# -gt 0 ]]; do ...@@ -29,15 +29,28 @@ while [[ $# -gt 0 ]]; do
esac esac
done done
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching vLLM-Omni Text-to-Text (1 GPU)"
echo "==========================================" echo "=========================================="
echo "Starting vLLM-Omni Worker (Text-to-Text)"
echo "Model: $MODEL" echo "Model: $MODEL"
echo "Stage Config: $STAGE_CONFIG" echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "==========================================" echo "=========================================="
# Run ingress (frontend) # Run ingress (frontend)
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
echo "Starting frontend on port ${DYN_HTTP_PORT:-8000}..."
python -m dynamo.frontend & python -m dynamo.frontend &
FRONTEND_PID=$! FRONTEND_PID=$!
......
...@@ -22,12 +22,26 @@ while [[ $# -gt 0 ]]; do ...@@ -22,12 +22,26 @@ while [[ $# -gt 0 ]]; do
esac esac
done done
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching vLLM-Omni Image Generation (1 GPU)"
echo "==========================================" echo "=========================================="
echo "Starting vLLM-Omni Worker"
echo "Model: $MODEL" echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Generate an image of a sunset over mountains.\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "==========================================" echo "=========================================="
echo "Starting frontend on port ${DYN_HTTP_PORT:-8000}..."
python -m dynamo.frontend & python -m dynamo.frontend &
FRONTEND_PID=$! FRONTEND_PID=$!
......
...@@ -23,13 +23,26 @@ while [[ $# -gt 0 ]]; do ...@@ -23,13 +23,26 @@ while [[ $# -gt 0 ]]; do
esac esac
done done
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching vLLM-Omni Video Generation (1 GPU)"
echo "==========================================" echo "=========================================="
echo "Starting vLLM-Omni Worker"
echo "Model: $MODEL" echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Generate a short video of ocean waves.\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "==========================================" echo "=========================================="
echo "Starting frontend on port ${DYN_HTTP_PORT:-8000}..."
python -m dynamo.frontend & python -m dynamo.frontend &
FRONTEND_PID=$! FRONTEND_PID=$!
......
...@@ -36,14 +36,36 @@ while [[ $# -gt 0 ]]; do ...@@ -36,14 +36,36 @@ while [[ $# -gt 0 ]]; do
esac esac
done done
MODEL="Qwen/Qwen3-0.6B"
# Set the request plane mode # Set the request plane mode
export DYN_REQUEST_PLANE=$REQUEST_PLANE export DYN_REQUEST_PLANE=$REQUEST_PLANE
echo "Using request plane mode: $REQUEST_PLANE" echo "Using request plane mode: $REQUEST_PLANE"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated Serving + Request Planes (1 GPU)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
# Frontend # Frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend & python -m dynamo.frontend &
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
DYN_HEALTH_CHECK_ENABLED=true \ DYN_HEALTH_CHECK_ENABLED=true \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager python -m dynamo.vllm --model "$MODEL" --enforce-eager
...@@ -11,6 +11,26 @@ export PYTHONHASHSEED=0 ...@@ -11,6 +11,26 @@ export PYTHONHASHSEED=0
MODEL="Qwen/Qwen3-0.6B" MODEL="Qwen/Qwen3-0.6B"
BLOCK_SIZE=64 BLOCK_SIZE=64
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated + KV Routing (2 GPUs)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Explain why Roger Federer is considered one of the greatest tennis players of all time\"}],"
echo " \"max_tokens\": 32"
echo " }'"
echo ""
echo "=========================================="
# run frontend + KV router # run frontend + KV router
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend \ python -m dynamo.frontend \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment