launch.sh 3.54 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Launch script for MM Router Worker with TRT-LLM backend
#
# This script starts:
# 1. TRT-LLM workers (standard, with KV event publishing)
# 2. MM Router Worker (computes mm_hash, routes to best worker)
# 3. Frontend (HTTP ingress)

set -e

# Get the directory where this script is located and navigate to dynamo root
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
DYNAMO_ROOT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)"
cd "$DYNAMO_ROOT"
echo "Working directory: $(pwd)"

# Configuration
MODEL="${MODEL:-Qwen/Qwen2-VL-2B-Instruct}"
MODEL_TYPE="${MODEL_TYPE:-qwen2_vl}"
NAMESPACE="${NAMESPACE:-default}"
BLOCK_SIZE="${BLOCK_SIZE:-32}"
HTTP_PORT="${HTTP_PORT:-8000}"
NUM_WORKERS="${NUM_WORKERS:-1}"

echo "=== MM Router Worker Launch Script ==="
echo "Model: $MODEL"
echo "Model Type: $MODEL_TYPE"
echo "Namespace: $NAMESPACE"
echo "Block Size: $BLOCK_SIZE"
echo "HTTP Port: $HTTP_PORT"
echo "Num Workers: $NUM_WORKERS"
echo ""

# Collect PIDs for cleanup
PIDS=()

cleanup() {
    echo "Cleaning up..."
    for pid in "${PIDS[@]}"; do
        kill "$pid" 2>/dev/null || true
    done
    wait 2>/dev/null
}
trap cleanup EXIT

# Start TRT-LLM workers
# Use a different served-model-name so Frontend routes to MM Router instead
# Use NATS request plane to match MM Router
echo ""
echo "=== Starting TRT-LLM Workers ==="
for i in $(seq 0 $((NUM_WORKERS - 1))); do
    echo "Starting TRT-LLM worker $i..."
    DYN_REQUEST_PLANE=nats python -m dynamo.trtllm \
        --model-path "$MODEL" \
        --served-model-name "${MODEL}__internal" \
        --endpoint "dyn://${NAMESPACE}.trtllm.generate" \
        --modality multimodal \
        --publish-events-and-metrics \
        --kv-block-size "$BLOCK_SIZE" \
        2>&1 | sed "s/^/[trtllm-$i] /" &
    PIDS+=($!)
done

# Wait for workers to initialize
echo "Waiting for TRT-LLM workers to initialize..."
sleep 15

# Start MM Router Worker
# Use NATS request plane to match Frontend
echo ""
echo "=== Starting MM Router Worker ==="
DYN_REQUEST_PLANE=nats python -m examples.backends.trtllm.mm_router_worker \
    --model "$MODEL" \
    --model-type "$MODEL_TYPE" \
    --namespace "$NAMESPACE" \
    --component mm_router \
    --endpoint generate \
    --downstream-component trtllm \
    --downstream-endpoint generate \
    --block-size "$BLOCK_SIZE" \
    2>&1 | sed "s/^/[mm_router] /" &
PIDS+=($!)

# Wait for router to initialize
echo "Waiting for MM Router to initialize..."
sleep 5

# Start Frontend
# Use NATS request plane to match MM Router
echo ""
echo "=== Starting Frontend ==="
DYN_REQUEST_PLANE=nats python -m dynamo.frontend \
    --http-port "$HTTP_PORT" \
    --router-mode round-robin \
    2>&1 | sed "s/^/[frontend] /" &
PIDS+=($!)

echo ""
echo "=== All services started ==="
echo "Frontend available at http://localhost:$HTTP_PORT"
echo ""
echo "Test with:"
echo "curl http://localhost:$HTTP_PORT/v1/chat/completions \\"
echo "  -H 'Content-Type: application/json' \\"
echo "  -d '{"
echo "    \"model\": \"$MODEL\","
echo "    \"messages\": [{"
echo "      \"role\": \"user\","
echo "      \"content\": [{"
echo "        \"type\": \"text\","
echo "        \"text\": \"Describe this image\""
echo "      }, {"
echo "        \"type\": \"image_url\","
echo "        \"image_url\": {\"url\": \"https://example.com/image.jpg\"}"
echo "      }]"
echo "    }],"
echo "    \"max_tokens\": 100"
echo "  }'"
echo ""
echo "Press Ctrl+C to stop all services"

# Wait for all background processes
wait