multi_node_tp_headless.sh 1.3 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Single-machine 2-GPU test for multi-node TP with --headless flag.
#
# Launches frontend + head (node-rank=0, GPU 0) + headless worker (node-rank=1, GPU 1)
# on localhost to validate the headless code path without requiring multiple machines.

set -e
trap 'echo "Cleaning up..."; kill 0' EXIT

MODEL="${MODEL:-Qwen/Qwen3-0.6B}"

15
16
17
18
19
KV_BYTES="${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES:-}"
GPU_MEM_ARGS=""
if [[ -n "$KV_BYTES" ]]; then
    GPU_MEM_ARGS="--kv-cache-memory-bytes $KV_BYTES --gpu-memory-utilization 0.01"
fi
20

21
22
23
24
25
26
27
28
29
30
echo "Starting Dynamo frontend..."
python3 -m dynamo.frontend &

echo "Starting dynamo.vllm head node (TP=2, nnodes=2, node-rank=0, GPU 0)..."
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
  --model "${MODEL}" \
  --tensor-parallel-size 2 \
  --nnodes 2 \
  --node-rank 0 \
  --master-addr 127.0.0.1 \
31
  --enforce-eager \
32
  $GPU_MEM_ARGS &
33
34
35
36
37
38
39
40
41

echo "Starting dynamo.vllm headless worker (TP=2, nnodes=2, node-rank=1, GPU 1)..."
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
  --model "${MODEL}" \
  --tensor-parallel-size 2 \
  --nnodes 2 \
  --node-rank 1 \
  --master-addr 127.0.0.1 \
  --enforce-eager \
42
  $GPU_MEM_ARGS \
43
44
45
  --headless &

wait