Unverified commit 75bf1e09, authored by Alec and committed by GitHub
Browse files

docs: restructure vLLM docs and add startup banners to launch scripts (#6698)


Signed-off-by: alec-flowers <aflowers@nvidia.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: ishandhanani <ishandhanani@gmail.com>
parent 47ed1227
...@@ -7,6 +7,25 @@ trap 'echo Cleaning up...; kill 0' EXIT ...@@ -7,6 +7,25 @@ trap 'echo Cleaning up...; kill 0' EXIT
# Common configuration # Common configuration
MODEL="Qwen/Qwen3-0.6B" MODEL="Qwen/Qwen3-0.6B"
BLOCK_SIZE=64 BLOCK_SIZE=64
# Port shown in the banner: DYN_HTTP_PORT when set and non-empty, else 8000.
HTTP_PORT="${DYN_HTTP_PORT:-8000}"

# Startup banner: deployment summary followed by a copy-pasteable curl example.
# A trailing "\\" in the here-document prints a single literal backslash.
cat <<BANNER
==========================================
Launching Aggregated + Approximate KV Routing (2 GPUs)
==========================================
Model: $MODEL
Frontend: http://localhost:$HTTP_PORT
==========================================

Example test command:

 curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
 -H 'Content-Type: application/json' \\
 -d '{
 "model": "${MODEL}",
 "messages": [{"role": "user", "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time"}],
 "max_tokens": 32
 }'

==========================================
BANNER
# run frontend with KV router (--router-mode kv) in approximate mode (--no-kv-events) # run frontend with KV router (--router-mode kv) in approximate mode (--no-kv-events)
python -m dynamo.frontend \ python -m dynamo.frontend \
......
...@@ -10,6 +10,27 @@ export PYTHONHASHSEED=0 ...@@ -10,6 +10,27 @@ export PYTHONHASHSEED=0
# Common configuration # Common configuration
MODEL="Qwen/Qwen3-0.6B" MODEL="Qwen/Qwen3-0.6B"
BLOCK_SIZE=64 BLOCK_SIZE=64
# Ports shown for the two frontends; each can be overridden through its own
# DYN_HTTP_PORT_R<n> variable and otherwise falls back to 8000 / 8001.
HTTP_PORT_R1="${DYN_HTTP_PORT_R1:-8000}"
HTTP_PORT_R2="${DYN_HTTP_PORT_R2:-8001}"

# Startup banner: deployment summary plus a curl example aimed at frontend R1.
# A trailing "\\" in the here-document prints a single literal backslash.
cat <<BANNER
==========================================
Launching Aggregated + KV Routing + Replicas (2 GPUs)
==========================================
Model: $MODEL
Frontend R1: http://localhost:$HTTP_PORT_R1
Frontend R2: http://localhost:$HTTP_PORT_R2
==========================================

Example test command:

 curl http://localhost:${HTTP_PORT_R1}/v1/chat/completions \\
 -H 'Content-Type: application/json' \\
 -d '{
 "model": "${MODEL}",
 "messages": [{"role": "user", "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time"}],
 "max_tokens": 32
 }'

==========================================
BANNER
# run two routers (different HTTP + system ports) # run two routers (different HTTP + system ports)
# Note: use --router-reset-states only on one router to avoid wiping shared state twice. # Note: use --router-reset-states only on one router to avoid wiping shared state twice.
......
...@@ -4,11 +4,31 @@ ...@@ -4,11 +4,31 @@
set -e set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
# Model named in the banner, and the port shown for the frontend
# (DYN_HTTP_PORT when set and non-empty, else 8000).
MODEL="meta-llama/Meta-Llama-3.1-8B-Instruct"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"

# Startup banner: deployment summary followed by a copy-pasteable curl example.
# A trailing "\\" in the here-document prints a single literal backslash.
cat <<BANNER
==========================================
Launching Speculative Decoding (1 GPU)
==========================================
Model: $MODEL
Frontend: http://localhost:$HTTP_PORT
==========================================

Example test command:

 curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
 -H 'Content-Type: application/json' \\
 -d '{
 "model": "${MODEL}",
 "messages": [{"role": "user", "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time"}],
 "max_tokens": 32
 }'

==========================================
BANNER
# --------------------------- # ---------------------------
# 1. Frontend (Ingress) # 1. Frontend (Ingress)
# --------------------------- # ---------------------------
python -m dynamo.frontend --http-port=8000 & python -m dynamo.frontend --http-port="$HTTP_PORT" &
# --------------------------- # ---------------------------
...@@ -17,7 +37,7 @@ python -m dynamo.frontend --http-port=8000 & ...@@ -17,7 +37,7 @@ python -m dynamo.frontend --http-port=8000 &
# This runs the main model with EAGLE as the draft model for speculative decoding # This runs the main model with EAGLE as the draft model for speculative decoding
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \ DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm \
--model meta-llama/Meta-Llama-3.1-8B-Instruct \ --model "$MODEL" \
--enforce-eager \ --enforce-eager \
--speculative_config '{ --speculative_config '{
"model": "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", "model": "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
......
...@@ -4,6 +4,29 @@ ...@@ -4,6 +4,29 @@
set -e set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
# Shared settings: the model named in the banner and the frontend port shown
# (DYN_HTTP_PORT when set and non-empty, else 8000).
MODEL="Qwen/Qwen3-30B-A3B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"

# Startup banner: deployment summary followed by a copy-pasteable curl example.
# A trailing "\\" in the here-document prints a single literal backslash.
cat <<BANNER
==========================================
Launching Data Parallel / Expert Parallelism (4 GPUs)
==========================================
Model: $MODEL
Frontend: http://localhost:$HTTP_PORT
==========================================

Example test command:

 curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
 -H 'Content-Type: application/json' \\
 -d '{
 "model": "${MODEL}",
 "messages": [{"role": "user", "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time"}],
 "max_tokens": 32
 }'

==========================================
BANNER
# run ingress # run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend --router-mode kv & python -m dynamo.frontend --router-mode kv &
...@@ -15,7 +38,7 @@ python -m dynamo.frontend --router-mode kv & ...@@ -15,7 +38,7 @@ python -m dynamo.frontend --router-mode kv &
for i in {0..3}; do for i in {0..3}; do
VLLM_NIXL_SIDE_CHANNEL_PORT=$((20096 + i)) \ VLLM_NIXL_SIDE_CHANNEL_PORT=$((20096 + i)) \
CUDA_VISIBLE_DEVICES=$i python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=$i python3 -m dynamo.vllm \
--model Qwen/Qwen3-30B-A3B \ --model "$MODEL" \
--data-parallel-rank $i \ --data-parallel-rank $i \
--data-parallel-size 4 \ --data-parallel-size 4 \
--enable-expert-parallel \ --enable-expert-parallel \
......
...@@ -4,18 +4,45 @@ ...@@ -4,18 +4,45 @@
set -e set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
# Shared settings: the model named in the banner and the frontend port shown
# (DYN_HTTP_PORT when set and non-empty, else 8000).
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"

# Startup banner: deployment summary followed by a copy-pasteable curl example.
# A trailing "\\" in the here-document prints a single literal backslash.
cat <<BANNER
==========================================
Launching Disaggregated Serving (2 GPUs)
==========================================
Model: $MODEL
Frontend: http://localhost:$HTTP_PORT
==========================================

Example test command:

 curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
 -H 'Content-Type: application/json' \\
 -d '{
 "model": "${MODEL}",
 "messages": [{"role": "user", "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time"}],
 "max_tokens": 32
 }'

==========================================
BANNER
# run ingress # run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend & python -m dynamo.frontend &
# --enforce-eager is added for quick deployment. for production use, need to remove this flag # --enforce-eager is added for quick deployment. for production use, need to remove this flag
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --disaggregation-mode decode --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' & CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--model "$MODEL" \
--enforce-eager \
--disaggregation-mode decode \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' &
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model "$MODEL" \
--enforce-eager \ --enforce-eager \
--disaggregation-mode prefill \ --disaggregation-mode prefill \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
......
...@@ -4,13 +4,34 @@ ...@@ -4,13 +4,34 @@
set -e set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
# Model named in the banner, and the port shown for the frontend
# (DYN_HTTP_PORT when set and non-empty, else 8000).
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"

# Startup banner: deployment summary followed by a copy-pasteable curl example.
# A trailing "\\" in the here-document prints a single literal backslash.
cat <<BANNER
==========================================
Launching Disaggregated Serving + KVBM (2 GPUs)
==========================================
Model: $MODEL
Frontend: http://localhost:$HTTP_PORT
==========================================

Example test command:

 curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
 -H 'Content-Type: application/json' \\
 -d '{
 "model": "${MODEL}",
 "messages": [{"role": "user", "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time"}],
 "max_tokens": 32
 }'

==========================================
BANNER
# run ingress # run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend & python -m dynamo.frontend &
# run decode worker on GPU 0, without enabling KVBM # run decode worker on GPU 0, without enabling KVBM
# NOTE: remove --enforce-eager for production use # NOTE: remove --enforce-eager for production use
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' --enforce-eager & CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model "$MODEL" --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' --enforce-eager &
# run prefill worker on GPU 1 with KVBM enabled using 20GB of CPU cache # run prefill worker on GPU 1 with KVBM enabled using 20GB of CPU cache
# NOTE: remove --enforce-eager for production use # NOTE: remove --enforce-eager for production use
...@@ -18,7 +39,7 @@ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ ...@@ -18,7 +39,7 @@ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
DYN_KVBM_CPU_CACHE_GB=20 \ DYN_KVBM_CPU_CACHE_GB=20 \
CUDA_VISIBLE_DEVICES=1 \ CUDA_VISIBLE_DEVICES=1 \
python3 -m dynamo.vllm \ python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model "$MODEL" \
--disaggregation-mode prefill \ --disaggregation-mode prefill \
--kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \ --kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
--enforce-eager \ --enforce-eager \
......
...@@ -4,15 +4,36 @@ ...@@ -4,15 +4,36 @@
set -e set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
# Model named in the banner, and the port shown for the frontend
# (DYN_HTTP_PORT when set and non-empty, else 8000).
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"

# Startup banner: deployment summary followed by a copy-pasteable curl example.
# A trailing "\\" in the here-document prints a single literal backslash.
cat <<BANNER
==========================================
Launching Disaggregated + KVBM 2P+2D (4 GPUs)
==========================================
Model: $MODEL
Frontend: http://localhost:$HTTP_PORT
==========================================

Example test command:

 curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
 -H 'Content-Type: application/json' \\
 -d '{
 "model": "${MODEL}",
 "messages": [{"role": "user", "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time"}],
 "max_tokens": 32
 }'

==========================================
BANNER
# run ingress with KV router # run ingress with KV router
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend --router-mode kv & python -m dynamo.frontend --router-mode kv &
# run decode workers on GPU 0 and 1, without enabling KVBM # run decode workers on GPU 0 and 1, without enabling KVBM
# NOTE: remove --enforce-eager for production use # NOTE: remove --enforce-eager for production use
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' --enforce-eager --disaggregation-mode decode & CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model "$MODEL" --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' --enforce-eager --disaggregation-mode decode &
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' --enforce-eager --disaggregation-mode decode & CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm --model "$MODEL" --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' --enforce-eager --disaggregation-mode decode &
# run prefill workers on GPU 2 and 3 with KVBM enabled using 20GB of CPU cache # run prefill workers on GPU 2 and 3 with KVBM enabled using 20GB of CPU cache
# NOTE: use different barrier id prefixes for each prefill worker to avoid conflicts # NOTE: use different barrier id prefixes for each prefill worker to avoid conflicts
...@@ -21,7 +42,7 @@ VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \ ...@@ -21,7 +42,7 @@ VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
DYN_KVBM_CPU_CACHE_GB=20 \ DYN_KVBM_CPU_CACHE_GB=20 \
CUDA_VISIBLE_DEVICES=2 \ CUDA_VISIBLE_DEVICES=2 \
python3 -m dynamo.vllm \ python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model "$MODEL" \
--disaggregation-mode prefill \ --disaggregation-mode prefill \
--kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \ --kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
--enforce-eager \ --enforce-eager \
...@@ -33,7 +54,7 @@ DYN_KVBM_LEADER_ZMQ_ACK_PORT=56004 \ ...@@ -33,7 +54,7 @@ DYN_KVBM_LEADER_ZMQ_ACK_PORT=56004 \
DYN_KVBM_CPU_CACHE_GB=20 \ DYN_KVBM_CPU_CACHE_GB=20 \
CUDA_VISIBLE_DEVICES=3 \ CUDA_VISIBLE_DEVICES=3 \
python3 -m dynamo.vllm \ python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model "$MODEL" \
--disaggregation-mode prefill \ --disaggregation-mode prefill \
--kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \ --kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
--enforce-eager \ --enforce-eager \
......
...@@ -10,6 +10,26 @@ export PYTHONHASHSEED=0 ...@@ -10,6 +10,26 @@ export PYTHONHASHSEED=0
# Common configuration # Common configuration
MODEL="Qwen/Qwen3-0.6B" MODEL="Qwen/Qwen3-0.6B"
# Port shown in the banner: DYN_HTTP_PORT when set and non-empty, else 8000.
HTTP_PORT="${DYN_HTTP_PORT:-8000}"

# Startup banner: deployment summary followed by a copy-pasteable curl example.
# A trailing "\\" in the here-document prints a single literal backslash.
cat <<BANNER
==========================================
Launching Disaggregated + KVBM + KV Routing (4 GPUs)
==========================================
Model: $MODEL
Frontend: http://localhost:$HTTP_PORT
==========================================

Example test command:

 curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
 -H 'Content-Type: application/json' \\
 -d '{
 "model": "${MODEL}",
 "messages": [{"role": "user", "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time"}],
 "max_tokens": 32
 }'

==========================================
BANNER
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend \ python -m dynamo.frontend \
--router-mode kv \ --router-mode kv \
......
...@@ -4,12 +4,33 @@ ...@@ -4,12 +4,33 @@
set -e set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
# Model named in the banner, and the port shown for the frontend
# (DYN_HTTP_PORT when set and non-empty, else 8000).
MODEL="Qwen/Qwen3-0.6B"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"

# Startup banner: deployment summary followed by a copy-pasteable curl example.
# A trailing "\\" in the here-document prints a single literal backslash.
cat <<BANNER
==========================================
Launching Disaggregated Serving + LMCache (2 GPUs)
==========================================
Model: $MODEL
Frontend: http://localhost:$HTTP_PORT
==========================================

Example test command:

 curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
 -H 'Content-Type: application/json' \\
 -d '{
 "model": "${MODEL}",
 "messages": [{"role": "user", "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time"}],
 "max_tokens": 32
 }'

==========================================
BANNER
# run ingress with KV router # run ingress with KV router
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend --router-mode kv & python -m dynamo.frontend --router-mode kv &
# run decode worker on GPU 0, without enabling LMCache # run decode worker on GPU 0, without enabling LMCache
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B & CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model "$MODEL" &
# wait for decode worker to initialize # wait for decode worker to initialize
sleep 20 sleep 20
...@@ -18,7 +39,7 @@ sleep 20 ...@@ -18,7 +39,7 @@ sleep 20
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 \ CUDA_VISIBLE_DEVICES=1 \
python3 -m dynamo.vllm \ python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model "$MODEL" \
--disaggregation-mode prefill \ --disaggregation-mode prefill \
--kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \ --kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
...@@ -57,11 +57,36 @@ done ...@@ -57,11 +57,36 @@ done
PD_MAX_MODEL_LEN="16384" PD_MAX_MODEL_LEN="16384"
echo "==================================================" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "Disaggregated Multimodal Serving (E + PD)" echo "=========================================="
echo "==================================================" if [[ "$SINGLE_GPU" == "true" ]]; then
echo "Model: $MODEL_NAME" GPU_LABEL="1 GPU"
echo "==================================================" else
GPU_LABEL="2 GPUs"
fi
# Banner body: title (with the GPU count label chosen earlier), model, frontend
# URL and a multimodal curl example that sends a text prompt plus an image URL.
# A trailing "\\" in the here-document prints a single literal backslash.
cat <<BANNER
Launching Disaggregated Multimodal E+PD ($GPU_LABEL)
==========================================
Model: $MODEL_NAME
Frontend: http://localhost:$HTTP_PORT
==========================================

Example test command:

 curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
 -H 'Content-Type: application/json' \\
 -d '{
 "model": "${MODEL_NAME}",
 "messages": [{
 "role": "user",
 "content": [
 {"type": "text", "text": "Describe the image."},
 {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/480px-Cat03.jpg"}}
 ]
 }],
 "max_tokens": 50
 }'

==========================================
BANNER
# Start frontend (no router mode) # Start frontend (no router mode)
......
...@@ -56,11 +56,36 @@ while [[ $# -gt 0 ]]; do ...@@ -56,11 +56,36 @@ while [[ $# -gt 0 ]]; do
done done
echo "==================================================" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "Disaggregated Multimodal Serving (E + P + D)" echo "=========================================="
echo "==================================================" if [[ "$SINGLE_GPU" == "true" ]]; then
echo "Model: $MODEL_NAME" GPU_LABEL="1 GPU"
echo "==================================================" else
GPU_LABEL="3 GPUs"
fi
# Banner body: title (with the GPU count label chosen earlier), model, frontend
# URL and a multimodal curl example that sends a text prompt plus an image URL.
# A trailing "\\" in the here-document prints a single literal backslash.
cat <<BANNER
Launching Disaggregated Multimodal E/P/D ($GPU_LABEL)
==========================================
Model: $MODEL_NAME
Frontend: http://localhost:$HTTP_PORT
==========================================

Example test command:

 curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
 -H 'Content-Type: application/json' \\
 -d '{
 "model": "${MODEL_NAME}",
 "messages": [{
 "role": "user",
 "content": [
 {"type": "text", "text": "Describe the image."},
 {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/480px-Cat03.jpg"}}
 ]
 }],
 "max_tokens": 50
 }'

==========================================
BANNER
# Start frontend (no router mode) # Start frontend (no router mode)
......
...@@ -47,6 +47,36 @@ done ...@@ -47,6 +47,36 @@ done
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
# Port shown in the banner: DYN_HTTP_PORT when set and non-empty, else 8000.
HTTP_PORT="${DYN_HTTP_PORT:-8000}"

# Banner header, printed on every node.
cat <<BANNER
==========================================
Launching Disaggregated Multimodal Llama 4 (Multi-Node)
==========================================
Model: $MODEL_NAME
BANNER

# The frontend URL is only printed when HEAD_NODE equals 1.
if [[ $HEAD_NODE -eq 1 ]]; then
  printf '%s\n' "Frontend: http://localhost:$HTTP_PORT"
fi
printf '%s\n' "=========================================="

# Likewise the multimodal curl example is only printed when HEAD_NODE equals 1.
# A trailing "\\" in the here-document prints a single literal backslash.
if [[ $HEAD_NODE -eq 1 ]]; then
  cat <<BANNER

Example test command:

 curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
 -H 'Content-Type: application/json' \\
 -d '{
 "model": "${MODEL_NAME}",
 "messages": [{
 "role": "user",
 "content": [
 {"type": "text", "text": "Describe the image."},
 {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/480px-Cat03.jpg"}}
 ]
 }],
 "max_tokens": 50
 }'

BANNER
fi
printf '%s\n' "=========================================="
# Use TCP transport to avoid NATS payload limits for multimodal # Use TCP transport to avoid NATS payload limits for multimodal
export DYN_REQUEST_PLANE=tcp export DYN_REQUEST_PLANE=tcp
......
...@@ -11,6 +11,26 @@ export PYTHONHASHSEED=0 ...@@ -11,6 +11,26 @@ export PYTHONHASHSEED=0
MODEL="Qwen/Qwen3-0.6B" MODEL="Qwen/Qwen3-0.6B"
BLOCK_SIZE=64 BLOCK_SIZE=64
# Port shown in the banner: DYN_HTTP_PORT when set and non-empty, else 8000.
HTTP_PORT="${DYN_HTTP_PORT:-8000}"

# Startup banner: deployment summary followed by a copy-pasteable curl example.
# A trailing "\\" in the here-document prints a single literal backslash.
cat <<BANNER
==========================================
Launching Disaggregated + KV Routing (4 GPUs)
==========================================
Model: $MODEL
Frontend: http://localhost:$HTTP_PORT
==========================================

Example test command:

 curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
 -H 'Content-Type: application/json' \\
 -d '{
 "model": "${MODEL}",
 "messages": [{"role": "user", "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time"}],
 "max_tokens": 32
 }'

==========================================
BANNER
# Start frontend with KV routing # Start frontend with KV routing
# The frontend will automatically detect prefill workers and activate an internal prefill router # The frontend will automatically detect prefill workers and activate an internal prefill router
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
......
...@@ -16,13 +16,32 @@ PT_HPU_LAZY_MODE=0 ...@@ -16,13 +16,32 @@ PT_HPU_LAZY_MODE=0
NIXL_BUFFER_DEVICE=cpu NIXL_BUFFER_DEVICE=cpu
VLLM_NIXL_BACKEND=UCX VLLM_NIXL_BACKEND=UCX
# Frontend port: DYN_HTTP_PORT when set and non-empty, else 8000. The same
# value is handed to the frontend's --http-port flag further down.
HTTP_PORT="${DYN_HTTP_PORT:-8000}"

# Startup banner: deployment summary followed by a copy-pasteable curl example.
# A trailing "\\" in the here-document prints a single literal backslash.
cat <<BANNER
==========================================
Launching Disaggregated + KV Routing on Gaudi (4 HPUs)
==========================================
Model: $MODEL
Frontend: http://localhost:$HTTP_PORT
==========================================

Example test command:

 curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
 -H 'Content-Type: application/json' \\
 -d '{
 "model": "${MODEL}",
 "messages": [{"role": "user", "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time"}],
 "max_tokens": 32
 }'

==========================================
BANNER
# Start frontend with KV routing # Start frontend with KV routing
# The frontend will automatically detect prefill workers and activate an internal prefill router # The frontend will automatically detect prefill workers and activate an internal prefill router
# edit --router-mode to random / round-robin / kv # edit --router-mode to random / round-robin / kv
python -m dynamo.frontend \ python -m dynamo.frontend \
--router-mode kv \ --router-mode kv \
--http-port 8000 \ --http-port "$HTTP_PORT" \
--router-reset-states & --router-reset-states &
# two decode workers # two decode workers
......
...@@ -32,6 +32,8 @@ else ...@@ -32,6 +32,8 @@ else
exit 1 exit 1
fi fi
# Model passed to the decode and prefill workers below and shown in the startup banner.
MODEL="Qwen/Qwen3-0.6B"
# Setup cleanup trap # Setup cleanup trap
cleanup() { cleanup() {
echo "Cleaning up background processes..." echo "Cleaning up background processes..."
...@@ -41,6 +43,26 @@ cleanup() { ...@@ -41,6 +43,26 @@ cleanup() {
} }
trap cleanup EXIT INT TERM trap cleanup EXIT INT TERM
# Port shown in the banner: DYN_HTTP_PORT when set and non-empty, else 8000.
HTTP_PORT="${DYN_HTTP_PORT:-8000}"

# Startup banner: deployment summary followed by a copy-pasteable curl example.
# A trailing "\\" in the here-document prints a single literal backslash.
cat <<BANNER
==========================================
Launching Disaggregated on Same GPU (1 GPU)
==========================================
Model: $MODEL
Frontend: http://localhost:$HTTP_PORT
==========================================

Example test command:

 curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
 -H 'Content-Type: application/json' \\
 -d '{
 "model": "${MODEL}",
 "messages": [{"role": "user", "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time"}],
 "max_tokens": 32
 }'

==========================================
BANNER
# run ingress # run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend & python3 -m dynamo.frontend &
...@@ -53,7 +75,7 @@ DYNAMO_PID=$! ...@@ -53,7 +75,7 @@ DYNAMO_PID=$!
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
CUDA_VISIBLE_DEVICES=0 \ CUDA_VISIBLE_DEVICES=0 \
python3 -m dynamo.vllm \ python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model "$MODEL" \
--enforce-eager \ --enforce-eager \
--disaggregation-mode decode \ --disaggregation-mode decode \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
...@@ -75,7 +97,7 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \ ...@@ -75,7 +97,7 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=0 \ CUDA_VISIBLE_DEVICES=0 \
python3 -m dynamo.vllm \ python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model "$MODEL" \
--enforce-eager \ --enforce-eager \
--disaggregation-mode prefill \ --disaggregation-mode prefill \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
......
...@@ -70,14 +70,35 @@ fi ...@@ -70,14 +70,35 @@ fi
# Calculate data parallel size # Calculate data parallel size
DATA_PARALLEL_SIZE=$((NUM_NODES * GPUS_PER_NODE)) DATA_PARALLEL_SIZE=$((NUM_NODES * GPUS_PER_NODE))
echo "Configuration:" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo " Number of nodes: $NUM_NODES" echo "=========================================="
echo " Node rank: $NODE_RANK" echo "Launching DeepSeek-R1 Data Parallel (Multi-Node)"
echo " GPUs per node: $GPUS_PER_NODE" echo "=========================================="
echo " Data parallel size: $DATA_PARALLEL_SIZE" echo "Model: $MODEL"
echo " Master address: $MASTER_ADDR" if [ "$NODE_RANK" -eq 0 ]; then
echo " Log directory: $LOG_DIR" echo "Frontend: http://localhost:$HTTP_PORT"
echo " Model name: $MODEL" fi
# Cluster layout section of the banner, printed on every node.
cat <<BANNER
Number of nodes: $NUM_NODES
Node rank: $NODE_RANK
GPUs per node: $GPUS_PER_NODE
Data parallel: $DATA_PARALLEL_SIZE
Master address: $MASTER_ADDR
Log directory: $LOG_DIR
==========================================
BANNER

# The curl example and its closing rule are only printed when NODE_RANK is 0.
# A trailing "\\" in the here-document prints a single literal backslash.
if [ "$NODE_RANK" -eq 0 ]; then
  cat <<BANNER

Example test command:

 curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
 -H 'Content-Type: application/json' \\
 -d '{
 "model": "${MODEL}",
 "messages": [{"role": "user", "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time"}],
 "max_tokens": 32
 }'

==========================================
BANNER
fi
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
......
...@@ -16,47 +16,53 @@ export DYN_LORA_PATH=/tmp/dynamo_loras_minio ...@@ -16,47 +16,53 @@ export DYN_LORA_PATH=/tmp/dynamo_loras_minio
mkdir -p $DYN_LORA_PATH mkdir -p $DYN_LORA_PATH
MODEL="Qwen/Qwen3-0.6B"
SYSTEM_PORT="${DYN_SYSTEM_PORT1:-8081}"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated Serving + LoRA (1 GPU)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Once running, test with:"
echo ""
echo " # Check available models"
echo " curl http://localhost:${HTTP_PORT}/v1/models | jq ."
echo ""
echo " # Load LoRA (using S3 URI)"
echo " curl -s -X POST http://localhost:${SYSTEM_PORT}/v1/loras \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"lora_name\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\","
echo " \"source\": {\"uri\": \"s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora\"}}' | jq ."
echo ""
echo " # Test LoRA inference"
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"model\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"What is deep learning?\"}],"
echo " \"max_tokens\": 300, \"temperature\": 0.0}' | jq ."
echo ""
echo " # Test base model inference (for comparison)"
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"What is deep learning?\"}],"
echo " \"max_tokens\": 300, \"temperature\": 0.0}' | jq ."
echo ""
echo " # Unload LoRA"
echo " curl -X DELETE http://localhost:${SYSTEM_PORT}/v1/loras/codelion/Qwen3-0.6B-accuracy-recovery-lora"
echo ""
echo "=========================================="
# run ingress # run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var. # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var.
python -m dynamo.frontend & python -m dynamo.frontend &
# run worker # run worker
# --enforce-eager is added for quick deployment. for production use, need to remove this flag # --enforce-eager is added for quick deployment. for production use, need to remove this flag
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \ DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT} \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager \ python -m dynamo.vllm --model "$MODEL" --enforce-eager \
--enable-lora \ --enable-lora \
--max-lora-rank 64 --max-lora-rank 64
################################## Example Usage ##################################
# Check available models
curl http://localhost:8000/v1/models | jq .
# Load LoRA using s3 uri
curl -s -X POST http://localhost:8081/v1/loras \
-H "Content-Type: application/json" \
-d '{"lora_name": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
"source": {"uri": "s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora"}}' | jq .
# Test LoRA inference
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
"messages": [{"role": "user", "content": "What is deep learning?"}],
"max_tokens": 300,
"temperature": 0.0
}'
# Test base model inference (for comparison)
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen3-0.6B",
"messages": [{"role": "user", "content": "What is deep learning?"}],
"max_tokens": 300,
"temperature": 0.0
}'
# Unload LoRA
curl -X DELETE http://localhost:8081/v1/loras/codelion/Qwen3-0.6B-accuracy-recovery-lora
...@@ -23,6 +23,41 @@ export PYTHONHASHSEED=0 ...@@ -23,6 +23,41 @@ export PYTHONHASHSEED=0
MODEL="Qwen/Qwen3-0.6B" MODEL="Qwen/Qwen3-0.6B"
BLOCK_SIZE=64 BLOCK_SIZE=64
SYSTEM_PORT1="${DYN_SYSTEM_PORT1:-8081}"
SYSTEM_PORT2="${DYN_SYSTEM_PORT2:-8082}"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated + LoRA + KV Routing (2 GPUs)"
echo "=========================================="
echo "Model: $MODEL"
echo "Frontend: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Once running, test with:"
echo ""
echo " # Check available models"
echo " curl http://localhost:${HTTP_PORT}/v1/models | jq ."
echo ""
echo " # Load LoRA to both instances (using S3 URI)"
echo " curl -s -X POST http://localhost:${SYSTEM_PORT1}/v1/loras \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"lora_name\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\","
echo " \"source\": {\"uri\": \"s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora\"}}' | jq ."
echo ""
echo " curl -s -X POST http://localhost:${SYSTEM_PORT2}/v1/loras \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"lora_name\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\","
echo " \"source\": {\"uri\": \"s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora\"}}' | jq ."
echo ""
echo " # Test LoRA inference"
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"model\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}],"
echo " \"max_tokens\": 32}' | jq ."
echo ""
echo "=========================================="
# run frontend + KV router # run frontend + KV router
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend \ python -m dynamo.frontend \
...@@ -31,7 +66,7 @@ python -m dynamo.frontend \ ...@@ -31,7 +66,7 @@ python -m dynamo.frontend \
# run workers # run workers
# --enforce-eager is added for quick deployment. for production use, need to remove this flag # --enforce-eager is added for quick deployment. for production use, need to remove this flag
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \ DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT1} \
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--model $MODEL \ --model $MODEL \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
...@@ -40,7 +75,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \ ...@@ -40,7 +75,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--max-lora-rank 64 \ --max-lora-rank 64 \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' & --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' &
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \ DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT2} \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model $MODEL \ --model $MODEL \
...@@ -50,42 +85,9 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ ...@@ -50,42 +85,9 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--max-lora-rank 64 \ --max-lora-rank 64 \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
# below commands are not executed automatically in the script because previous backend launch command is blocking. # Sample output after running LoRA inference curl request twice.
################################## Example Usage ##################################
# Check available models
curl http://localhost:8000/v1/models | jq .
# Load LoRA to instances using s3 uri
curl -s -X POST http://localhost:8081/v1/loras \
-H "Content-Type: application/json" \
-d '{"lora_name": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
"source": {"uri": "s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora"}}' | jq .
curl -s -X POST http://localhost:8082/v1/loras \
-H "Content-Type: application/json" \
-d '{"lora_name": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
"source": {"uri": "s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora"}}' | jq .
# Test LoRA inference
curl localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
"messages": [
{
"role": "user",
"content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden."
}
],
"stream": false,
"max_tokens": 30
}' | jq .
# Sample output after running above curl request twice.
# usage.prompt_tokens_details.cached_tokens is the number of tokens that were cached from the previous request. # usage.prompt_tokens_details.cached_tokens is the number of tokens that were cached from the previous request.
: <<'SAMPLE_OUTPUT'
{ {
"id": "chatcmpl-0cf880c2-fe98-45c4-9c76-84c3ad1a56cc", "id": "chatcmpl-0cf880c2-fe98-45c4-9c76-84c3ad1a56cc",
"choices": [ "choices": [
...@@ -118,3 +120,4 @@ curl localhost:8000/v1/chat/completions \ ...@@ -118,3 +120,4 @@ curl localhost:8000/v1/chat/completions \
} }
} }
} }
SAMPLE_OUTPUT
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
MODEL="Qwen/Qwen3-VL-30B-A3B-Instruct-FP8"
CAPACITY_GB=10 CAPACITY_GB=10
EXTRA_ARGS=() EXTRA_ARGS=()
...@@ -25,8 +26,35 @@ if [[ "$CAPACITY_GB" != "0" ]]; then ...@@ -25,8 +26,35 @@ if [[ "$CAPACITY_GB" != "0" ]]; then
}") }")
fi fi
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching vLLM Serve + Embedding Cache (1 GPU)"
echo "=========================================="
echo "Model: $MODEL"
echo "Server: http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{"
echo " \"model\": \"${MODEL}\","
echo " \"messages\": [{"
echo " \"role\": \"user\","
echo " \"content\": ["
echo " {\"type\": \"text\", \"text\": \"Describe the image.\"},"
echo " {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/480px-Cat03.jpg\"}}"
echo " ]"
echo " }],"
echo " \"max_tokens\": 50"
echo " }'"
echo ""
echo "=========================================="
CUDA_VISIBLE_DEVICES=2 \ CUDA_VISIBLE_DEVICES=2 \
vllm serve Qwen/Qwen3-VL-30B-A3B-Instruct-FP8 \ vllm serve $MODEL \
--port "$HTTP_PORT" \
--enable-log-requests \ --enable-log-requests \
--max-model-len 16384 \ --max-model-len 16384 \
--gpu-memory-utilization .9 \ --gpu-memory-utilization .9 \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment