#!/bin/bash # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 set -ex SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" source "$SCRIPT_DIR/../../../common/launch_utils.sh" # Default values HEAD_NODE=0 MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" EXTRA_ARGS=() # Parse command line arguments while [[ $# -gt 0 ]]; do case $1 in --head-node) HEAD_NODE=1 shift 1 ;; --model) MODEL_NAME=$2 shift 2 ;; -h|--help) echo "Usage: $0 [OPTIONS]" echo "" echo "Disaggregated multimodal serving with separate Prefill/Decode workers for Llama 4" echo "" echo "Options:" echo " --head-node Run as head node. Head node will run the HTTP server, processor and prefill worker." echo " --model Specify the VLM model to use (default: $MODEL_NAME)" echo " -h, --help Show this help message" echo "" echo "Examples:" echo " # On head node:" echo " $0 --head-node" echo "" echo " # On worker node (requires NATS_SERVER and ETCD_ENDPOINTS pointing to head node):" echo " $0" echo "" exit 0 ;; *) EXTRA_ARGS+=("$1") shift ;; esac done trap 'echo Cleaning up...; kill 0' EXIT HTTP_PORT="${DYN_HTTP_PORT:-8000}" if [[ $HEAD_NODE -eq 1 ]]; then print_launch_banner --multimodal "Launching Disaggregated Multimodal Llama 4 (Multi-Node)" "$MODEL_NAME" "$HTTP_PORT" else print_launch_banner --no-curl "Launching Disaggregated Multimodal Llama 4 (Multi-Node)" "$MODEL_NAME" "$HTTP_PORT" fi # Use TCP transport to avoid NATS payload limits for multimodal export DYN_REQUEST_PLANE=tcp # Configure model-specific args GPU_MEM="0.80" KV_BYTES="${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES:-}" if [[ -n "$KV_BYTES" ]]; then GPU_MEM_ARGS="--kv-cache-memory-bytes $KV_BYTES --gpu-memory-utilization 0.01" else GPU_MEM_ARGS="--gpu-memory-utilization $GPU_MEM" fi MODEL_SPECIFIC_ARGS="" if [[ "$MODEL_NAME" == "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" ]]; then MODEL_SPECIFIC_ARGS="--tensor-parallel-size=8 --max-model-len=208960 $GPU_MEM_ARGS" fi if [[ $HEAD_NODE -eq 1 ]]; then # run ingress # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) python -m dynamo.frontend & # run processor (CPU-only to avoid competing for GPU memory with workers) CUDA_VISIBLE_DEVICES="" \ python -m dynamo.vllm --route-to-encoder --enable-multimodal --model $MODEL_NAME & # Prefill worker handles prompt processing and image encoding # Uses all 8 GPUs for tensor-parallel CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ python -m dynamo.vllm \ --enable-multimodal \ --model $MODEL_NAME \ --disaggregation-mode prefill \ --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ $MODEL_SPECIFIC_ARGS \ --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' \ "${EXTRA_ARGS[@]}" & else # run decode worker on non-head node # Uses all 8 GPUs for tensor-parallel CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \ python -m dynamo.vllm \ --enable-multimodal \ --model $MODEL_NAME \ --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ $MODEL_SPECIFIC_ARGS \ --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' \ "${EXTRA_ARGS[@]}" & fi # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest wait_any_exit