#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Single-machine 2-GPU test for multi-node TP with --headless flag.
#
# Launches frontend + head (node-rank=0, GPU 0) + headless worker (node-rank=1, GPU 1)
# on localhost to validate the headless code path without requiring multiple machines.
#
# Environment:
#   MODEL
#       Value passed to --model (default: Qwen/Qwen3-0.6B).
#   _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE
#       If set and non-empty, forwarded to both dynamo.vllm processes as
#       --gpu-memory-utilization. Omitted from the command line otherwise.
#
# Exit status:
#   The exit status of whichever launched process terminates first. The
#   remaining processes are signalled by the EXIT trap before the script ends.

set -euo pipefail

MODEL="${MODEL:-Qwen/Qwen3-0.6B}"
GPU_MEM_FRACTION="${_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE:-}"

#######################################
# Signal every process in this script's process group on exit.
# Globals:   none
# Arguments: none
# Outputs:   writes "Cleaning up..." to stdout
#######################################
cleanup() {
  echo "Cleaning up..."
  # `kill 0` targets the whole process group, which includes this shell.
  # Ignore SIGTERM here first so the script still finishes with the exit
  # status it was about to return instead of dying from its own signal.
  trap '' TERM
  kill 0
}
trap cleanup EXIT

#######################################
# Start one dynamo.vllm process in the background, pinned to a single GPU.
# Globals:   MODEL (read), GPU_MEM_FRACTION (read)
# Arguments: $1 - value for CUDA_VISIBLE_DEVICES
#            $2 - value for --node-rank
#            $@ - (remaining) extra flags appended after the common ones
# Returns:   0 once the background job has been started
#######################################
launch_vllm_node() {
  local -r gpu="$1" node_rank="$2"
  shift 2

  local -a vllm_args=(
    --model "${MODEL}"
    --tensor-parallel-size 2
    --nnodes 2
    --node-rank "${node_rank}"
    --master-addr 127.0.0.1
    --enforce-eager
  )
  # Only pass the memory flag when an override was actually provided.
  if [[ -n "${GPU_MEM_FRACTION}" ]]; then
    vllm_args+=(--gpu-memory-utilization "${GPU_MEM_FRACTION}")
  fi
  vllm_args+=("$@")

  CUDA_VISIBLE_DEVICES="${gpu}" python3 -m dynamo.vllm "${vllm_args[@]}" &
}

echo "Starting Dynamo frontend..."
python3 -m dynamo.frontend &

echo "Starting dynamo.vllm head node (TP=2, nnodes=2, node-rank=0, GPU 0)..."
launch_vllm_node 0 0

echo "Starting dynamo.vllm headless worker (TP=2, nnodes=2, node-rank=1, GPU 1)..."
launch_vllm_node 1 1 --headless

# `set -e` does not cover background jobs, and a bare `wait` would block until
# all three processes are gone. Return as soon as any one of them exits so a
# crashed component fails the run instead of leaving the others hanging.
status=0
wait -n || status=$?
exit "${status}"