Unverified commit a72f41f6, authored by Indrajit Bhosale, committed by GitHub
Browse files

fix: Update vllm multimodal PD Disagg launch script (#5951)

parent 6b70d845
...@@ -5,6 +5,8 @@ set -ex ...@@ -5,6 +5,8 @@ set -ex
# Default values # Default values
HEAD_NODE=0 HEAD_NODE=0
MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
EXTRA_ARGS=()
# Parse command line arguments # Parse command line arguments
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
...@@ -13,6 +15,10 @@ while [[ $# -gt 0 ]]; do ...@@ -13,6 +15,10 @@ while [[ $# -gt 0 ]]; do
HEAD_NODE=1 HEAD_NODE=1
shift 1 shift 1
;; ;;
--model)
    # Option arm: select the model passed to the workers.
    # Require a non-empty value. Without this check a trailing `--model`
    # makes `shift 2` fail, and the script stops with no explanation.
    if [[ $# -lt 2 || -z "$2" ]]; then
        echo "Error: --model requires a <model_name> argument" >&2
        echo "Use --help for usage information" >&2
        exit 1
    fi
    MODEL_NAME=$2
    shift 2
    ;;
-h|--help) -h|--help)
echo "Usage: $0 [OPTIONS]" echo "Usage: $0 [OPTIONS]"
echo "" echo ""
...@@ -20,6 +26,7 @@ while [[ $# -gt 0 ]]; do ...@@ -20,6 +26,7 @@ while [[ $# -gt 0 ]]; do
echo "" echo ""
echo "Options:" echo "Options:"
echo " --head-node Run as head node. Head node will run the HTTP server, processor and prefill worker." echo " --head-node Run as head node. Head node will run the HTTP server, processor and prefill worker."
echo " --model <model_name> Specify the VLM model to use (default: $MODEL_NAME)"
echo " -h, --help Show this help message" echo " -h, --help Show this help message"
echo "" echo ""
echo "Examples:" echo "Examples:"
...@@ -32,32 +39,54 @@ while [[ $# -gt 0 ]]; do ...@@ -32,32 +39,54 @@ while [[ $# -gt 0 ]]; do
exit 0 exit 0
;; ;;
*) *)
echo "Unknown option: $1" EXTRA_ARGS+=("$1")
echo "Use --help for usage information" shift
exit 1
;; ;;
esac esac
done done
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" # Use TCP transport to avoid NATS payload limits for multimodal
export DYN_REQUEST_PLANE=tcp
# Select extra engine flags that apply only to specific models.
# The result is one space-separated string; it is empty for any model
# that has no dedicated entry below.
case "$MODEL_NAME" in
    "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8")
        MODEL_SPECIFIC_ARGS="--tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80"
        ;;
    *)
        MODEL_SPECIFIC_ARGS=""
        ;;
esac
if [[ $HEAD_NODE -eq 1 ]]; then if [[ $HEAD_NODE -eq 1 ]]; then
# run ingress # run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend & python -m dynamo.frontend &
# run processor # run processor (CPU-only to avoid competing for GPU memory with workers)
CUDA_VISIBLE_DEVICES="" \
python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME & python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME &
# Llama 4 doesn't support image embedding input, so the prefill worker will also # Prefill worker handles prompt processing and image encoding
# handle image encoding inline. # Uses all 8 GPUs for tensor-parallel
# run prefill worker CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 python -m dynamo.vllm --multimodal-encode-prefill-worker --is-prefill-worker --enable-multimodal --model $MODEL_NAME --tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80 --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' & VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
python -m dynamo.vllm \
--enable-multimodal \
--model $MODEL_NAME \
--is-prefill-worker \
$MODEL_SPECIFIC_ARGS \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' \
"${EXTRA_ARGS[@]}" &
else else
# run decode worker on non-head node # run decode worker on non-head node
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 python -m dynamo.vllm --multimodal-decode-worker --enable-multimodal --model $MODEL_NAME --tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80 --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' & # Uses all 8 GPUs for tensor-parallel
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
python -m dynamo.vllm \
--enable-multimodal \
--model $MODEL_NAME \
$MODEL_SPECIFIC_ARGS \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' \
"${EXTRA_ARGS[@]}" &
fi fi
# Wait for all background processes to complete # Wait for all background processes to complete
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment