launch.sh

#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Launch script for vLLM MM Router Worker demo:
#   Frontend (round-robin) -> MM Router Worker -> vLLM backend
#
# This script is intended as a step-by-step runnable demo on a single machine.

# Strict mode: exit on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Resolve the directory containing this script, then the directory four levels
# above it (DYNAMO_ROOT), and make that the working directory for everything
# that follows.
script_source="${BASH_SOURCE[0]}"
SCRIPT_DIR="$(cd "$(dirname "${script_source}")" && pwd)"
DYNAMO_ROOT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)"
cd "${DYNAMO_ROOT}"

# ---------------------------------------------------------------------------
# Configuration (override with environment variables)
# ---------------------------------------------------------------------------
# Each setting keeps a non-empty value inherited from the environment and
# otherwise falls back to the default given here.
: "${MODEL:=Qwen/Qwen3-VL-8B-Instruct}"
: "${NAMESPACE:=dynamo}"
: "${HTTP_PORT:=8000}"
: "${BLOCK_SIZE:=16}"                     # Must match vLLM backend KV block size
: "${GPU_MEMORY_UTILIZATION:=0.85}"
: "${MAX_MODEL_LEN:=8192}"

# Endpoints of the externally managed NATS and etcd services.
: "${NATS_SERVER:=nats://127.0.0.1:4222}"
: "${ETCD_ENDPOINTS:=http://127.0.0.1:2379}"

# Ports passed to each worker as DYN_SYSTEM_PORT and polled at /health.
: "${VLLM_SYSTEM_PORT:=18081}"
: "${MM_ROUTER_SYSTEM_PORT:=18082}"

: "${MM_ROUTER_COMPONENT:=mm_router}"
: "${BACKEND_COMPONENT:=backend}"         # dynamo.vllm default

# Extra args (word-splitting is intentional for shell-style overrides).
# These default to the empty string so they are always defined under `set -u`.
: "${VLLM_EXTRA_ARGS:=}"
: "${FRONTEND_EXTRA_ARGS:=}"
: "${MM_ROUTER_EXTRA_ARGS:=}"

# Print a banner with the working directory and the effective settings,
# one per line, followed by a blank line.
printf '%s\n' \
    "=== vLLM MM Router Worker Launch Script ===" \
    "Working directory: ${DYNAMO_ROOT}" \
    "MODEL=${MODEL}" \
    "NAMESPACE=${NAMESPACE}" \
    "HTTP_PORT=${HTTP_PORT}" \
    "BLOCK_SIZE=${BLOCK_SIZE}" \
    "NATS_SERVER=${NATS_SERVER}" \
    "ETCD_ENDPOINTS=${ETCD_ENDPOINTS}" \
    "VLLM_SYSTEM_PORT=${VLLM_SYSTEM_PORT}" \
    "MM_ROUTER_SYSTEM_PORT=${MM_ROUTER_SYSTEM_PORT}" \
    ""

# PIDs of the background services started by this script.
PIDS=()

# Stop every tracked background service and reap it.
# Globals:
#   PIDS (read, then cleared) - PIDs to terminate.
# Outputs:
#   A short notice on stdout when there is something to stop.
# Returns:
#   Always 0. Failures from kill/wait are ignored on purpose, because a
#   service may already have exited by the time cleanup runs.
cleanup() {
    # Nothing tracked (or already cleaned up): stay silent so repeated calls
    # are harmless.
    if (( ${#PIDS[@]} == 0 )); then
        return 0
    fi

    echo
    echo "Cleaning up background processes..."
    local pid
    for pid in "${PIDS[@]}"; do
        kill "${pid}" 2>/dev/null || true
    done
    # Forget the PIDs before waiting so a second invocation does nothing.
    PIDS=()
    wait 2>/dev/null || true
    return 0
}

# Run cleanup exactly once, on exit. INT/TERM are turned into an exit with the
# conventional 128+signal status; without an explicit exit the script would
# carry on after the handler returned (e.g. keep polling in a wait loop).
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# Poll a health URL until its response body contains "status": "ready".
# Globals:
#   PIDS (read) - PIDs of the services launched so far; if any of them is no
#                 longer running, waiting is abandoned immediately.
# Arguments:
#   $1 - URL to poll.
#   $2 - Service name used in log messages.
#   $3 - Timeout in seconds (default: 240).
# Outputs:
#   Progress on stdout; timeout / early-exit diagnostics on stderr.
# Returns:
#   0 once the service reports ready; 1 on timeout or if a tracked process
#   has exited.
wait_ready() {
    local url="$1"
    local name="$2"
    local timeout_s="${3:-240}"
    local deadline=$((SECONDS + timeout_s))
    local body pid

    echo "Waiting for ${name} at ${url} ..."
    while (( SECONDS < deadline )); do
        # Capture the body first instead of piping curl into `grep -q`: under
        # `pipefail` the pipeline can report failure when grep exits before
        # curl has finished writing. --max-time bounds each probe so a hung
        # connection cannot stall the loop indefinitely.
        if body="$(curl -fsS --max-time 5 "${url}" 2>/dev/null)" \
            && grep -q '"status"[[:space:]]*:[[:space:]]*"ready"' <<<"${body}"; then
            echo "${name} is ready"
            return 0
        fi

        # Fail fast if a launched service died instead of waiting out the
        # whole timeout. The `:-` form keeps an empty PIDS safe under `set -u`.
        for pid in "${PIDS[@]:-}"; do
            if [[ -n "${pid}" ]] && ! kill -0 "${pid}" 2>/dev/null; then
                echo "Process ${pid} exited while waiting for ${name} (${url})" >&2
                return 1
            fi
        done

        sleep 1
    done

    echo "Timed out waiting for ${name} (${url})" >&2
    return 1
}

# Poll the frontend's models URL until an HTTP request to it succeeds.
# Globals:
#   PIDS (read) - PIDs of the services launched so far; if any of them is no
#                 longer running, waiting is abandoned immediately.
# Arguments:
#   $1 - URL to poll.
#   $2 - Timeout in seconds (default: 240).
# Outputs:
#   Progress on stdout; timeout / early-exit diagnostics on stderr.
# Returns:
#   0 once curl succeeds; 1 on timeout or if a tracked process has exited.
wait_frontend_models() {
    local url="$1"
    local timeout_s="${2:-240}"
    local deadline=$((SECONDS + timeout_s))
    local pid

    echo "Waiting for frontend models API at ${url} ..."
    while (( SECONDS < deadline )); do
        # --max-time bounds each probe so a hung connection cannot stall the
        # loop indefinitely; -f makes HTTP error statuses count as failure.
        if curl -fsS --max-time 5 "${url}" >/dev/null 2>&1; then
            echo "Frontend is ready"
            return 0
        fi

        # Fail fast if a launched service died instead of waiting out the
        # whole timeout. The `:-` form keeps an empty PIDS safe under `set -u`.
        for pid in "${PIDS[@]:-}"; do
            if [[ -n "${pid}" ]] && ! kill -0 "${pid}" 2>/dev/null; then
                echo "Process ${pid} exited while waiting for frontend (${url})" >&2
                return 1
            fi
        done

        sleep 1
    done

    echo "Timed out waiting for frontend (${url})" >&2
    return 1
}

# Remind the user that etcd and NATS are not started by this script.
printf '%s\n' \
    "Prerequisite: start etcd and NATS yourself before running this script." \
    "Example:" \
    "  docker compose -f deploy/docker-compose.yml up -d" \
    ""

# Environment assignments passed (via `env`) to every service launched below.
COMMON_ENV=()
COMMON_ENV+=("DYN_NAMESPACE=${NAMESPACE}")
COMMON_ENV+=("DYN_REQUEST_PLANE=nats")
COMMON_ENV+=("NATS_SERVER=${NATS_SERVER}")
COMMON_ENV+=("ETCD_ENDPOINTS=${ETCD_ENDPOINTS}")

printf '\n%s\n' "=== Starting vLLM backend worker ==="
# Use an internal served-model-name so frontend traffic goes to the MM router
# (which registers the public model name) instead of directly to the backend.
vllm_backend_cmd=(
    python -m dynamo.vllm
    --model "${MODEL}"
    --enable-multimodal
    --block-size "${BLOCK_SIZE}"
    --enforce-eager
    --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}"
    --max-model-len "${MAX_MODEL_LEN}"
    --served-model-name "${MODEL}__internal"
)
# VLLM_EXTRA_ARGS stays unquoted on purpose so it splits into separate words.
env "${COMMON_ENV[@]}" "DYN_SYSTEM_PORT=${VLLM_SYSTEM_PORT}" \
    "${vllm_backend_cmd[@]}" ${VLLM_EXTRA_ARGS} &
# Track the background PID so it is stopped on exit.
PIDS+=($!)

# Block until the backend's /health endpoint reports ready (900 s timeout).
wait_ready "http://127.0.0.1:${VLLM_SYSTEM_PORT}/health" "vLLM backend" 900

printf '\n%s\n' "=== Starting vLLM MM Router Worker ==="
# Environment specific to the router, on top of COMMON_ENV.
mm_router_env=(
    "DYN_LOG=debug"
    "DYN_SYSTEM_PORT=${MM_ROUTER_SYSTEM_PORT}"
    'DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS=["generate"]'
)
# The router is started under the public model name and pointed at the
# backend component's "generate" endpoint as its downstream.
mm_router_cmd=(
    python -m examples.backends.vllm.mm_router_worker
    --model "${MODEL}"
    --namespace "${NAMESPACE}"
    --component "${MM_ROUTER_COMPONENT}"
    --endpoint generate
    --downstream-component "${BACKEND_COMPONENT}"
    --downstream-endpoint generate
    --block-size "${BLOCK_SIZE}"
)
# MM_ROUTER_EXTRA_ARGS stays unquoted on purpose so it splits into separate words.
env "${COMMON_ENV[@]}" "${mm_router_env[@]}" \
    "${mm_router_cmd[@]}" ${MM_ROUTER_EXTRA_ARGS} &
# Track the background PID so it is stopped on exit.
PIDS+=($!)

# Block until the router's /health endpoint reports ready (300 s timeout).
wait_ready "http://127.0.0.1:${MM_ROUTER_SYSTEM_PORT}/health" "MM router" 300

printf '\n%s\n' "=== Starting frontend ==="
# The frontend listens on HTTP_PORT and is started in round-robin router mode.
frontend_cmd=(
    python -m dynamo.frontend
    --http-port "${HTTP_PORT}"
    --router-mode round-robin
)
# FRONTEND_EXTRA_ARGS stays unquoted on purpose so it splits into separate words.
env "${COMMON_ENV[@]}" "DYN_LOG=info" \
    "${frontend_cmd[@]}" ${FRONTEND_EXTRA_ARGS} &
# Track the background PID so it is stopped on exit.
PIDS+=($!)

# Block until a request to the frontend's /v1/models succeeds (300 s timeout).
wait_frontend_models "http://127.0.0.1:${HTTP_PORT}/v1/models" 300

# Summarize the running endpoints and show a sample request to try.
printf '\n%s\n' "=== All services are ready ==="
printf '%s\n' \
    "Frontend:    http://127.0.0.1:${HTTP_PORT}" \
    "MM Router:   http://127.0.0.1:${MM_ROUTER_SYSTEM_PORT}/health" \
    "vLLM backend:http://127.0.0.1:${VLLM_SYSTEM_PORT}/health" \
    "" \
    "Try the same multimodal request twice and compare MM router logs for:" \
    '  [ROUTING] Best: worker_... with X/Y blocks overlap' \
    "" \
    "Example:" \
    "  curl http://127.0.0.1:${HTTP_PORT}/v1/chat/completions \\" \
    "    -H 'Content-Type: application/json' \\" \
    "    -d '{\"model\":\"${MODEL}\",\"messages\":[{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"Describe this image\"},{\"type\":\"image_url\",\"image_url\":{\"url\":\"http://images.cocodataset.org/test2017/000000000001.jpg\"}}]}],\"max_tokens\":32}'" \
    "" \
    "Press Ctrl+C to stop all services"

# Keep the script in the foreground until the background services exit.
wait