Unverified Commit a72f41f6 authored by Indrajit Bhosale's avatar Indrajit Bhosale Committed by GitHub
Browse files

fix: Update vllm multimodal PD Disagg launch script (#5951)

parent 6b70d845
......@@ -5,6 +5,8 @@ set -ex
# Default values
HEAD_NODE=0
MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
EXTRA_ARGS=()
# Parse command line arguments
while [[ $# -gt 0 ]]; do
......@@ -13,6 +15,10 @@ while [[ $# -gt 0 ]]; do
HEAD_NODE=1
shift 1
;;
--model)
MODEL_NAME=$2
shift 2
;;
-h|--help)
echo "Usage: $0 [OPTIONS]"
echo ""
......@@ -20,6 +26,7 @@ while [[ $# -gt 0 ]]; do
echo ""
echo "Options:"
echo " --head-node Run as head node. Head node will run the HTTP server, processor and prefill worker."
echo " --model <model_name> Specify the VLM model to use (default: $MODEL_NAME)"
echo " -h, --help Show this help message"
echo ""
echo "Examples:"
......@@ -32,32 +39,54 @@ while [[ $# -gt 0 ]]; do
exit 0
;;
*)
echo "Unknown option: $1"
echo "Use --help for usage information"
exit 1
EXTRA_ARGS+=("$1")
shift
;;
esac
done
trap 'echo Cleaning up...; kill 0' EXIT
MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
# Use TCP transport to avoid NATS payload limits for multimodal
export DYN_REQUEST_PLANE=tcp
# Configure model-specific args
MODEL_SPECIFIC_ARGS=""
if [[ "$MODEL_NAME" == "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" ]]; then
MODEL_SPECIFIC_ARGS="--tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80"
fi
if [[ $HEAD_NODE -eq 1 ]]; then
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# run processor
# run processor (CPU-only to avoid competing for GPU memory with workers)
CUDA_VISIBLE_DEVICES="" \
python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME &
# Llama 4 doesn't support image embedding input, so the prefill worker will also
# handle image encoding inline.
# run prefill worker
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 python -m dynamo.vllm --multimodal-encode-prefill-worker --is-prefill-worker --enable-multimodal --model $MODEL_NAME --tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80 --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &
# Prefill worker handles prompt processing and image encoding
# Uses all 8 GPUs for tensor-parallel
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
python -m dynamo.vllm \
--enable-multimodal \
--model $MODEL_NAME \
--is-prefill-worker \
$MODEL_SPECIFIC_ARGS \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' \
"${EXTRA_ARGS[@]}" &
else
# run decode worker on non-head node
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 python -m dynamo.vllm --multimodal-decode-worker --enable-multimodal --model $MODEL_NAME --tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80 --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
# Uses all 8 GPUs for tensor-parallel
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
python -m dynamo.vllm \
--enable-multimodal \
--model $MODEL_NAME \
$MODEL_SPECIFIC_ARGS \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' \
"${EXTRA_ARGS[@]}" &
fi
# Wait for all background processes to complete
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment