#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

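# Hierarchical Planner Example
# Every component below is started as a background job (trailing `&`).
# To run the components by hand instead, run each command in a separate
# terminal, in order from bottom to top, and wait a few seconds between
# starting each component.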

# ============================================================================
# frontend + global_router
# ============================================================================
# Both processes get an explicit namespace so that the mockers are not
# registered to the frontend. "dynamo" cannot be used as that namespace
# because it is reserved for all namespaces.
frontend_args=(
  --router-mode round-robin
  --namespace hierarchical
)
python -m dynamo.frontend "${frontend_args[@]}" &

# The config path is relative to the directory the script is run from.
global_router_args=(
  --config examples/hierarchical_planner/global_router_config.json
  --model-name Qwen/Qwen3-0.6B
  --default-ttft-target 100
  --default-itl-target 10
  --namespace hierarchical
)
python -m dynamo.global_router "${global_router_args[@]}" &

# ============================================================================
# prefill_pool_0 - local router + mocker worker (prefill)
# ============================================================================
# Local router for this pool. DYN_NAMESPACE is set only in the environment of
# the router process, not for the rest of the script.
prefill_pool_0_router_args=(
  --endpoint prefill_pool_0.worker.generate
  --router-block-size 16
  --no-router-track-active-blocks  # prefill router does not need to track active blocks
)
DYN_NAMESPACE=prefill_pool_0 python -m dynamo.router "${prefill_pool_0_router_args[@]}" &

# Mocker worker in prefill disaggregation mode. Its --endpoint names the same
# prefill_pool_0.worker.generate path as the router, with a dyn:// prefix, and
# its --block-size equals the router's --router-block-size (16).
prefill_pool_0_mocker_args=(
  --model-path Qwen/Qwen3-0.6B
  --endpoint dyn://prefill_pool_0.worker.generate
  --disaggregation-mode prefill
  --block-size 16
)
python -m dynamo.mocker "${prefill_pool_0_mocker_args[@]}" &

# ============================================================================
# prefill_pool_1 - local router + mocker worker (prefill)
# ============================================================================
# Local router for this pool. DYN_NAMESPACE is set only in the environment of
# the router process, not for the rest of the script.
prefill_pool_1_router_args=(
  --endpoint prefill_pool_1.worker.generate
  --router-block-size 16
  --no-router-track-active-blocks  # prefill router does not need to track active blocks
)
DYN_NAMESPACE=prefill_pool_1 python -m dynamo.router "${prefill_pool_1_router_args[@]}" &

# Mocker worker in prefill disaggregation mode. Its --endpoint names the same
# prefill_pool_1.worker.generate path as the router, with a dyn:// prefix, and
# its --block-size equals the router's --router-block-size (16).
prefill_pool_1_mocker_args=(
  --model-path Qwen/Qwen3-0.6B
  --endpoint dyn://prefill_pool_1.worker.generate
  --disaggregation-mode prefill
  --block-size 16
)
python -m dynamo.mocker "${prefill_pool_1_mocker_args[@]}" &

# ============================================================================
# decode_pool_0 - local router + mocker worker (decode)
# ============================================================================
# Local router for this pool. DYN_NAMESPACE is set only in the environment of
# the router process, not for the rest of the script.
# NOTE(review): a KV-overlap score weight of 0 presumably turns off
# overlap-based scoring for decode routing; confirm against the router's help.
decode_pool_0_router_args=(
  --endpoint decode_pool_0.worker.generate
  --router-block-size 16
  --router-kv-overlap-score-weight 0
)
DYN_NAMESPACE=decode_pool_0 python -m dynamo.router "${decode_pool_0_router_args[@]}" &

# Mocker worker in decode disaggregation mode. Its --endpoint names the same
# decode_pool_0.worker.generate path as the router, with a dyn:// prefix, and
# its --block-size equals the router's --router-block-size (16).
decode_pool_0_mocker_args=(
  --model-path Qwen/Qwen3-0.6B
  --endpoint dyn://decode_pool_0.worker.generate
  --disaggregation-mode decode
  --block-size 16
)
python -m dynamo.mocker "${decode_pool_0_mocker_args[@]}" &

# ============================================================================
# test request
# ============================================================================

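# Wait for all components to start, then run the request below by hand
# (it is commented out so the script does not send it automatically).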
# curl -X POST http://localhost:8000/v1/chat/completions \
#   -H "Content-Type: application/json" \
#   -d '{
#     "model": "Qwen/Qwen3-0.6B",
#     "messages": [{"role": "user", "content": "Hello!"}],
#     "max_tokens": 50,
#     "stream": true
#   }'