#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Launch one node of a multi-node, disaggregated multimodal Llama 4 deployment.
#
#   --head-node   start the frontend, the processor and the prefill worker
#   (default)     start the decode worker
#
# Options:
#   --head-node     Run as head node.
#   --model NAME    Model to serve (default: the Llama 4 Maverick FP8 model below).
#   -h, --help      Print usage and exit.
#   Any other argument is forwarded verbatim to the prefill/decode worker.
#
# Environment:
#   DYN_HTTP_PORT      Port shown in the banner/example (default 8000).
#   DYN_REQUEST_PLANE  Exported as "tcp" by this script.
#   Per the help text, worker nodes also need NATS_SERVER and ETCD_ENDPOINTS
#   pointing to the head node; this script does not set or check them.

set -ex

# Default values
HEAD_NODE=0
MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
EXTRA_ARGS=()

# Parse command line arguments
while [[ $# -gt 0 ]]; do
  case "$1" in
    --head-node)
      HEAD_NODE=1
      shift 1
      ;;
    --model)
      # Without this check a bare `--model` makes `shift 2` fail and `set -e`
      # aborts the script with no explanation.
      if [[ $# -lt 2 ]]; then
        echo "Error: --model requires a value" >&2
        exit 1
      fi
      MODEL_NAME=$2
      shift 2
      ;;
    -h|--help)
      echo "Usage: $0 [OPTIONS]"
      echo ""
      echo "Disaggregated multimodal serving with separate Prefill/Decode workers for Llama 4"
      echo ""
      echo "Options:"
      echo " --head-node Run as head node. Head node will run the HTTP server, processor and prefill worker."
      echo " --model Specify the VLM model to use (default: $MODEL_NAME)"
      echo " -h, --help Show this help message"
      echo ""
      echo "Examples:"
      echo " # On head node:"
      echo " $0 --head-node"
      echo ""
      echo " # On worker node (requires NATS_SERVER and ETCD_ENDPOINTS pointing to head node):"
      echo " $0"
      echo ""
      exit 0
      ;;
    *)
      # Unrecognized arguments are collected and passed through to the worker.
      EXTRA_ARGS+=("$1")
      shift
      ;;
  esac
done

# On any exit, signal the whole process group so background services started
# below do not outlive this script.
trap 'echo Cleaning up...; kill 0' EXIT

HTTP_PORT="${DYN_HTTP_PORT:-8000}"

echo "=========================================="
echo "Launching Disaggregated Multimodal Llama 4 (Multi-Node)"
echo "=========================================="
echo "Model: $MODEL_NAME"
if [[ $HEAD_NODE -eq 1 ]]; then
  echo "Frontend: http://localhost:$HTTP_PORT"
fi
echo "=========================================="

# Only the head node runs the frontend, so only it prints the sample request.
if [[ $HEAD_NODE -eq 1 ]]; then
  echo ""
  echo "Example test command:"
  echo ""
  echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
  echo " -H 'Content-Type: application/json' \\"
  echo " -d '{"
  echo " \"model\": \"${MODEL_NAME}\","
  echo " \"messages\": [{"
  echo " \"role\": \"user\","
  echo " \"content\": ["
  echo " {\"type\": \"text\", \"text\": \"Describe the image.\"},"
  echo " {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/480px-Cat03.jpg\"}}"
  echo " ]"
  echo " }],"
  echo " \"max_tokens\": 50"
  echo " }'"
  echo ""
fi
echo "=========================================="

# Use TCP transport to avoid NATS payload limits for multimodal
export DYN_REQUEST_PLANE=tcp

# Configure model-specific args.
# Kept as an array so each flag reaches the worker as its own word without
# relying on unquoted word-splitting; it stays empty for any other model.
MODEL_SPECIFIC_ARGS=()
if [[ "$MODEL_NAME" == "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" ]]; then
  MODEL_SPECIFIC_ARGS=(
    --tensor-parallel-size=8
    --max-model-len=208960
    --gpu-memory-utilization 0.80
  )
fi

if [[ $HEAD_NODE -eq 1 ]]; then
  # run ingress
  # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
  python -m dynamo.frontend &

  # run processor (CPU-only to avoid competing for GPU memory with workers)
  CUDA_VISIBLE_DEVICES="" \
  python -m dynamo.vllm --route-to-encoder --enable-multimodal --model "$MODEL_NAME" &

  # Prefill worker handles prompt processing and image encoding
  # Uses all 8 GPUs for tensor-parallel
  CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
  python -m dynamo.vllm \
    --enable-multimodal \
    --model "$MODEL_NAME" \
    --disaggregation-mode prefill \
    --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
    "${MODEL_SPECIFIC_ARGS[@]}" \
    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' \
    "${EXTRA_ARGS[@]}" &
else
  # run decode worker on non-head node
  # Uses all 8 GPUs for tensor-parallel
  CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
  python -m dynamo.vllm \
    --enable-multimodal \
    --model "$MODEL_NAME" \
    --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
    "${MODEL_SPECIFIC_ARGS[@]}" \
    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' \
    "${EXTRA_ARGS[@]}" &
fi

# Wait for all background processes to complete
wait