#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Launches two dynamo.frontend routers (KV routing mode) and two dynamo.vllm
# workers from a single shell.
#
# Optional environment overrides (defaults in parentheses):
#   DYN_SYSTEM_PORT_R1 (8091), DYN_HTTP_PORT_R1 (8000)  - first router
#   DYN_SYSTEM_PORT_R2 (8092), DYN_HTTP_PORT_R2 (8001)  - second router
#   DYN_SYSTEM_PORT1   (8081), DYN_SYSTEM_PORT2 (8082)  - workers

# Abort on the first failing foreground command.
set -e

# Whenever the script exits, signal the whole process group (`kill 0`) so the
# processes started in the background below are stopped along with it.
trap 'echo Cleaning up...; kill 0' EXIT

# Set deterministic hash for KV event IDs
export PYTHONHASHSEED=0

# Common configuration shared by both workers
MODEL="Qwen/Qwen3-0.6B"
BLOCK_SIZE=64

# Run two routers as background jobs. Each one gets its own HTTP port and its
# own system port; both pairs can be overridden through the environment.
# Note: use --router-reset-states only on one router to avoid wiping shared state twice.

# Router 1: the only router that passes --router-reset-states.
DYN_SYSTEM_PORT="${DYN_SYSTEM_PORT_R1:-8091}" \
python -m dynamo.frontend \
    --router-mode kv \
    --router-reset-states \
    --http-port "${DYN_HTTP_PORT_R1:-8000}" &

# Router 2: same routing mode, different ports, no state reset.
DYN_SYSTEM_PORT="${DYN_SYSTEM_PORT_R2:-8092}" \
python -m dynamo.frontend \
    --router-mode kv \
    --http-port "${DYN_HTTP_PORT_R2:-8001}" &

# run workers (local indexer is enabled by default, so routers can query on restart)
# Each worker is pinned to its own GPU via CUDA_VISIBLE_DEVICES and publishes
# KV cache events on its own ZMQ endpoint (20080 / 20081).

# Worker 1: GPU 0, started in the background.
DYN_SYSTEM_PORT="${DYN_SYSTEM_PORT1:-8081}" \
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
    --model "$MODEL" \
    --block-size "$BLOCK_SIZE" \
    --enforce-eager \
    --connector none \
    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' &

# Worker 2: GPU 1, run in the foreground (no trailing `&`) so the script keeps
# running until this worker exits.
# NOTE(review): VLLM_NIXL_SIDE_CHANNEL_PORT is set only here, presumably so it
# does not collide with worker 1's default side-channel port - confirm.
DYN_SYSTEM_PORT="${DYN_SYSTEM_PORT2:-8082}" \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
    --model "$MODEL" \
    --block-size "$BLOCK_SIZE" \
    --enforce-eager \
    --connector none \
    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'