Unverified commit b1930a61, authored by KrishnanPrash and committed by GitHub
Browse files

chore: migrate vllm e/p/d test from gpu_2 -> gpu_1 (#6638)


Signed-off-by: Krishnan Prashanth <kprashanth@nvidia.com>
parent f1d5c95a
...@@ -7,6 +7,16 @@ trap 'echo Cleaning up...; kill 0' EXIT ...@@ -7,6 +7,16 @@ trap 'echo Cleaning up...; kill 0' EXIT
# Default values # Default values
MODEL_NAME="llava-hf/llava-1.5-7b-hf" MODEL_NAME="llava-hf/llava-1.5-7b-hf"
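# --single-gpu: Tunes the workers so that all 3 (encode, prefill, decode) fit on a
# single GPU. This is intended for functional testing with small models (e.g. 2B)
# where CI only has 1 GPU available. It trades performance for memory by:
# - Enabling --enforce-eager on every worker (disables torch.compile and CUDA graph capture)
# - Hardcoding P/D KV cache to 512 MB (skips all memory profiling)
# - Limiting --max-model-len to 4096 tokens on P/D workers
# - Limiting P/D workers to image=1,video=0,audio=0 (--limit-mm-per-prompt)
# The flag itself does not change GPU placement or memory fractions: the caller must
# also point DYN_ENCODE_WORKER_GPU / DYN_PREFILL_WORKER_GPU / DYN_DECODE_WORKER_GPU at
# the same device and lower DYN_ENCODE_GPU_MEM / DYN_PREFILL_GPU_MEM / DYN_DECODE_GPU_MEM
# (each defaults to 0.9) so the three gpu-memory-utilization fractions share the GPU.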
SINGLE_GPU=false
# Parse command line arguments # Parse command line arguments
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
case $1 in case $1 in
...@@ -14,6 +24,10 @@ while [[ $# -gt 0 ]]; do ...@@ -14,6 +24,10 @@ while [[ $# -gt 0 ]]; do
MODEL_NAME=$2 MODEL_NAME=$2
shift 2 shift 2
;; ;;
--single-gpu)
SINGLE_GPU=true
shift
;;
-h|--help) -h|--help)
echo "Usage: $0 [OPTIONS]" echo "Usage: $0 [OPTIONS]"
echo "" echo ""
...@@ -22,12 +36,14 @@ while [[ $# -gt 0 ]]; do ...@@ -22,12 +36,14 @@ while [[ $# -gt 0 ]]; do
echo "Options:" echo "Options:"
echo " --model <model_name> Specify the VLM model to use (default: $MODEL_NAME)" echo " --model <model_name> Specify the VLM model to use (default: $MODEL_NAME)"
echo " LLaVA 1.5 7B, Qwen2.5-VL, and Phi3V models have predefined templates" echo " LLaVA 1.5 7B, Qwen2.5-VL, and Phi3V models have predefined templates"
echo " --single-gpu Pack all 3 workers on 1 GPU (for small models, e.g. 2B)"
echo " -h, --help Show this help message" echo " -h, --help Show this help message"
echo "" echo ""
echo "Examples:" echo "Examples:"
echo " $0 --model llava-hf/llava-1.5-7b-hf" echo " $0 --model llava-hf/llava-1.5-7b-hf"
echo " $0 --model microsoft/Phi-3.5-vision-instruct" echo " $0 --model microsoft/Phi-3.5-vision-instruct"
echo " $0 --model Qwen/Qwen2.5-VL-7B-Instruct" echo " $0 --model Qwen/Qwen2.5-VL-7B-Instruct"
echo " $0 --model Qwen/Qwen3-VL-2B-Instruct --single-gpu"
echo "" echo ""
exit 0 exit 0
;; ;;
...@@ -41,7 +57,7 @@ done ...@@ -41,7 +57,7 @@ done
echo "==================================================" echo "=================================================="
echo "Disaggregated Multimodal Serving" echo "Disaggregated Multimodal Serving (E + P + D)"
echo "==================================================" echo "=================================================="
echo "Model: $MODEL_NAME" echo "Model: $MODEL_NAME"
echo "==================================================" echo "=================================================="
...@@ -53,6 +69,7 @@ echo "Starting frontend..." ...@@ -53,6 +69,7 @@ echo "Starting frontend..."
python -m dynamo.frontend & python -m dynamo.frontend &
EXTRA_ARGS="" EXTRA_ARGS=""
PD_EXTRA_ARGS=""
# GPU assignments (override via environment variables) # GPU assignments (override via environment variables)
DYN_ENCODE_WORKER_GPU=${DYN_ENCODE_WORKER_GPU:-0} DYN_ENCODE_WORKER_GPU=${DYN_ENCODE_WORKER_GPU:-0}
...@@ -64,6 +81,17 @@ DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.9} ...@@ -64,6 +81,17 @@ DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.9}
DYN_PREFILL_GPU_MEM=${DYN_PREFILL_GPU_MEM:-0.9} DYN_PREFILL_GPU_MEM=${DYN_PREFILL_GPU_MEM:-0.9}
DYN_DECODE_GPU_MEM=${DYN_DECODE_GPU_MEM:-0.9} DYN_DECODE_GPU_MEM=${DYN_DECODE_GPU_MEM:-0.9}
# 512 MB KV cache per P/D worker. Setting --kv-cache-memory-bytes bypasses vLLM's
# memory profiling entirely (both language model and multimodal encoder), which avoids
# OOM during profiling when 3 workers share a GPU. 512 MB covers the
# minimum vLLM requires for max_model_len=4096 on Qwen3-VL-2B.
PD_KV_CACHE_BYTES=$((512 * 1024 * 1024))
if [[ "$SINGLE_GPU" == "true" ]]; then
EXTRA_ARGS="--enforce-eager"
PD_EXTRA_ARGS="--max-model-len 4096 --kv-cache-memory-bytes $PD_KV_CACHE_BYTES --limit-mm-per-prompt {\"image\":1,\"video\":0,\"audio\":0}"
fi
# Start encode worker # Start encode worker
echo "Starting encode worker on GPU $DYN_ENCODE_WORKER_GPU (GPU mem: $DYN_ENCODE_GPU_MEM)..." echo "Starting encode worker on GPU $DYN_ENCODE_WORKER_GPU (GPU mem: $DYN_ENCODE_GPU_MEM)..."
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 CUDA_VISIBLE_DEVICES=$DYN_ENCODE_WORKER_GPU python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME --gpu-memory-utilization $DYN_ENCODE_GPU_MEM $EXTRA_ARGS --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' & VLLM_NIXL_SIDE_CHANNEL_PORT=20097 CUDA_VISIBLE_DEVICES=$DYN_ENCODE_WORKER_GPU python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME --gpu-memory-utilization $DYN_ENCODE_GPU_MEM $EXTRA_ARGS --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &
...@@ -71,12 +99,12 @@ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 CUDA_VISIBLE_DEVICES=$DYN_ENCODE_WORKER_GPU py ...@@ -71,12 +99,12 @@ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 CUDA_VISIBLE_DEVICES=$DYN_ENCODE_WORKER_GPU py
# Start prefill worker (also handles encode routing via --route-to-encoder) # Start prefill worker (also handles encode routing via --route-to-encoder)
echo "Starting prefill worker on GPU $DYN_PREFILL_WORKER_GPU (GPU mem: $DYN_PREFILL_GPU_MEM)..." echo "Starting prefill worker on GPU $DYN_PREFILL_WORKER_GPU (GPU mem: $DYN_PREFILL_GPU_MEM)..."
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
CUDA_VISIBLE_DEVICES=$DYN_PREFILL_WORKER_GPU python -m dynamo.vllm --multimodal-worker --route-to-encoder --disaggregation-mode prefill --enable-multimodal --enable-mm-embeds --model $MODEL_NAME --gpu-memory-utilization $DYN_PREFILL_GPU_MEM $EXTRA_ARGS --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' & CUDA_VISIBLE_DEVICES=$DYN_PREFILL_WORKER_GPU python -m dynamo.vllm --multimodal-worker --route-to-encoder --disaggregation-mode prefill --enable-multimodal --enable-mm-embeds --model $MODEL_NAME --gpu-memory-utilization $DYN_PREFILL_GPU_MEM $EXTRA_ARGS $PD_EXTRA_ARGS --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
# Start decode worker # Start decode worker
echo "Starting decode worker on GPU $DYN_DECODE_WORKER_GPU (GPU mem: $DYN_DECODE_GPU_MEM)..." echo "Starting decode worker on GPU $DYN_DECODE_WORKER_GPU (GPU mem: $DYN_DECODE_GPU_MEM)..."
VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \
CUDA_VISIBLE_DEVICES=$DYN_DECODE_WORKER_GPU python -m dynamo.vllm --multimodal-decode-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME --gpu-memory-utilization $DYN_DECODE_GPU_MEM $EXTRA_ARGS --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}' & CUDA_VISIBLE_DEVICES=$DYN_DECODE_WORKER_GPU python -m dynamo.vllm --multimodal-decode-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME --gpu-memory-utilization $DYN_DECODE_GPU_MEM $EXTRA_ARGS $PD_EXTRA_ARGS --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}' &
echo "==================================================" echo "=================================================="
echo "All components started. Waiting for initialization..." echo "All components started. Waiting for initialization..."
...@@ -84,4 +112,3 @@ echo "==================================================" ...@@ -84,4 +112,3 @@ echo "=================================================="
# Wait for all background processes to complete # Wait for all background processes to complete
wait wait
...@@ -276,6 +276,7 @@ vllm_configs = { ...@@ -276,6 +276,7 @@ vllm_configs = {
completion_payload_default(), completion_payload_default(),
], ],
), ),
# NOTE: Pack all workers on 1 GPU for lower CI resource requirements
"multimodal_disagg_qwen3vl_2b_e_pd": VLLMConfig( "multimodal_disagg_qwen3vl_2b_e_pd": VLLMConfig(
name="multimodal_disagg_qwen3vl_2b_e_pd", name="multimodal_disagg_qwen3vl_2b_e_pd",
directory=vllm_dir, directory=vllm_dir,
...@@ -335,20 +336,22 @@ vllm_configs = { ...@@ -335,20 +336,22 @@ vllm_configs = {
) )
], ],
), ),
# NOTE: Pack all workers on 1 GPU for lower CI resource requirements
"multimodal_disagg_qwen3vl_2b_epd": VLLMConfig( "multimodal_disagg_qwen3vl_2b_epd": VLLMConfig(
name="multimodal_disagg_qwen3vl_2b_epd", name="multimodal_disagg_qwen3vl_2b_epd",
directory=vllm_dir, directory=vllm_dir,
script_name="disagg_multimodal_epd.sh", script_name="disagg_multimodal_epd.sh",
marks=[pytest.mark.gpu_2, pytest.mark.pre_merge], marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
model="Qwen/Qwen3-VL-2B-Instruct", model="Qwen/Qwen3-VL-2B-Instruct",
script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct"], script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
timeout=360,
env={ env={
"DYN_ENCODE_WORKER_GPU": "0", "DYN_ENCODE_WORKER_GPU": "0",
"DYN_PREFILL_WORKER_GPU": "0", "DYN_PREFILL_WORKER_GPU": "0",
"DYN_DECODE_WORKER_GPU": "1", "DYN_DECODE_WORKER_GPU": "0",
"DYN_ENCODE_GPU_MEM": "0.4", "DYN_ENCODE_GPU_MEM": "0.1",
"DYN_PREFILL_GPU_MEM": "0.4", "DYN_PREFILL_GPU_MEM": "0.4",
"DYN_DECODE_GPU_MEM": "0.85", "DYN_DECODE_GPU_MEM": "0.4",
}, },
request_payloads=[ request_payloads=[
chat_payload( chat_payload(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment