Unverified commit fb1b4f92, authored by Kris Hung and committed by GitHub
Browse files

chore: Add vllm multimodal tests to pre_merge (#4889)

parent e83847c0
......@@ -143,6 +143,8 @@ jobs:
echo ${K8S_NODE_NAME}
- name: Checkout code
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
with:
lfs: true
- name: Docker Login
uses: ./.github/actions/docker-login
with:
......
......@@ -18,7 +18,7 @@ from typing import Any, Dict, Optional
import torch
from .model import SupportedModels, is_model_supported
from .model import SupportedModels, is_model_supported, is_qwen_vl_model
logger = logging.getLogger(__name__)
......@@ -88,7 +88,7 @@ def encode_image_embeddings(
embeddings = projector(vision_outputs.last_hidden_state)
elif is_model_supported(model_name, SupportedModels.QWEN_2_5_VL_7B):
elif is_qwen_vl_model(model_name):
embeddings = get_qwen_image_features(vision_encoder, image_embeds)
else:
......@@ -123,7 +123,7 @@ def get_encoder_components(
projector = getattr(vision_model, "multi_modal_projector", None)
return vision_encoder, projector
elif is_model_supported(model_name, SupportedModels.QWEN_2_5_VL_7B):
elif is_qwen_vl_model(model_name):
vision_encoder = vision_model
projector = None
return vision_encoder, projector
......
......@@ -27,6 +27,7 @@ class SupportedModels:
"""Supported multimodal model identifiers"""
LLAVA_1_5_7B = "llava-hf/llava-1.5-7b-hf"
QWEN_2_VL_2B = "Qwen/Qwen2-VL-2B-Instruct"
QWEN_2_5_VL_7B = "Qwen/Qwen2.5-VL-7B-Instruct"
LLAVA_NEXT_VIDEO_7B = "llava-hf/LLaVA-NeXT-Video-7B-hf"
......@@ -100,6 +101,28 @@ def is_model_supported(model_name: str, supported_model: str) -> bool:
return normalized_name == normalized_supported
# Qwen VL model identifiers that is_qwen_vl_model() checks a model name against.
# To have another Qwen VL variant recognized, add its SupportedModels entry here.
QWEN_VL_MODELS = [
SupportedModels.QWEN_2_VL_2B,
SupportedModels.QWEN_2_5_VL_7B,
]
def is_qwen_vl_model(model_name: str) -> bool:
    """
    Report whether ``model_name`` matches one of the known Qwen VL variants.

    The name is compared against each entry of ``QWEN_VL_MODELS`` in order
    using ``is_model_supported``; the scan stops at the first match.

    Args:
        model_name: The model name to check

    Returns:
        True if any entry of ``QWEN_VL_MODELS`` matches, False otherwise
    """
    for candidate in QWEN_VL_MODELS:
        if is_model_supported(model_name, candidate):
            return True
    return False
def load_vision_model(model_id: str) -> torch.nn.Module:
"""
Load a vision model from a HuggingFace model ID.
......@@ -132,7 +155,7 @@ def construct_mm_data(
image_embeds = image_embeds.to(embeddings_dtype)
# Model-specific image handling
if is_model_supported(model, SupportedModels.QWEN_2_5_VL_7B):
if is_qwen_vl_model(model):
return _construct_qwen_image_data(image_embeds, image_grid_thw)
else:
# Default image handling for other models (e.g., LLAVA_1_5_7B)
......
......@@ -19,6 +19,7 @@ trap 'echo Cleaning up...; kill 0' EXIT
MODEL_NAME="llava-hf/llava-1.5-7b-hf"
PROMPT_TEMPLATE="USER: <image>\n<prompt> ASSISTANT:"
PROVIDED_PROMPT_TEMPLATE=""
SINGLE_GPU=false
# Parse command line arguments
while [[ $# -gt 0 ]]; do
......@@ -31,11 +32,16 @@ while [[ $# -gt 0 ]]; do
PROVIDED_PROMPT_TEMPLATE=$2
shift 2
;;
--single-gpu)
SINGLE_GPU=true
shift
;;
-h|--help)
echo "Usage: $0 [OPTIONS]"
echo "Options:"
echo " --model <model_name> Specify the model to use (default: $MODEL_NAME)"
echo " --prompt-template <template> Specify the multi-modal prompt template to use. LLaVA 1.5 7B, Qwen2.5-VL, and Phi3V models have predefined templates."
echo " --single-gpu Run both encode and PD workers on GPU 0 (for pre-merge CI)"
echo " -h, --help Show this help message"
exit 0
;;
......@@ -54,7 +60,7 @@ elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then
PROMPT_TEMPLATE="USER: <image>\n<prompt> ASSISTANT:"
elif [[ "$MODEL_NAME" == "microsoft/Phi-3.5-vision-instruct" ]]; then
PROMPT_TEMPLATE="<|user|>\n<|image_1|>\n<prompt><|end|>\n<|assistant|>\n"
elif [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
elif [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]] || [[ "$MODEL_NAME" == "Qwen/Qwen2-VL-2B-Instruct" ]]; then
PROMPT_TEMPLATE="<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|><prompt><|im_end|>\n<|im_start|>assistant\n"
else
echo "No multi-modal prompt template is defined for the model: $MODEL_NAME"
......@@ -67,11 +73,14 @@ fi
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# To make Qwen2.5-VL fit in A100 40GB, set the following extra arguments
# Set GPU memory utilization and model length based on deployment mode
# Single-GPU mode: Both workers share GPU 0, so use reduced memory settings
# Multi-GPU mode: Each worker gets its own GPU, so use higher memory settings
EXTRA_ARGS=""
if [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 4096"
elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then
if [[ "$SINGLE_GPU" == "true" ]]; then
EXTRA_ARGS="--gpu-memory-utilization 0.3 --max-model-len 3072 --enforce-eager"
else
# Multi-GPU mode: standard memory settings
EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 4096"
fi
......@@ -79,8 +88,15 @@ fi
python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" &
# run E/P/D workers
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME &
CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME $EXTRA_ARGS &
# Use single GPU (GPU 0) for pre-merge CI, otherwise use GPU 0 for encode and GPU 1 for PD
if [[ "$SINGLE_GPU" == "true" ]]; then
# Single GPU mode: both workers share GPU 0 with reduced memory
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS &
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME $EXTRA_ARGS &
else
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS &
CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME $EXTRA_ARGS &
fi
# Wait for all background processes to complete
wait
......@@ -79,7 +79,6 @@ python -m dynamo.frontend &
echo "Starting processor..."
python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" &
# Configure GPU memory optimization for specific models
EXTRA_ARGS=""
# Start encode worker
......
......@@ -227,6 +227,32 @@ vllm_configs = {
completion_payload_default(),
],
),
"multimodal_agg_qwen2vl_2b_epd": VLLMConfig(
name="multimodal_agg_qwen2vl_2b_epd",
directory=vllm_dir,
script_name="agg_multimodal_epd.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
model="Qwen/Qwen2-VL-2B-Instruct",
script_args=["--model", "Qwen/Qwen2-VL-2B-Instruct", "--single-gpu"],
request_payloads=[
chat_payload(
[
{
"type": "text",
"text": "What colors are in the following image? Respond only with the colors.",
},
{
"type": "image_url",
"image_url": {"url": MULTIMODAL_IMG_URL},
},
],
repeat_count=1,
expected_response=["purple"],
temperature=0.0,
max_tokens=100,
)
],
),
"multimodal_agg_llava_epd": VLLMConfig(
name="multimodal_agg_llava_epd",
directory=vllm_dir,
......@@ -284,7 +310,7 @@ vllm_configs = {
name="multimodal_agg_qwen",
directory=vllm_dir,
script_name="agg_multimodal.sh",
marks=[pytest.mark.gpu_2, pytest.mark.nightly],
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
model="Qwen/Qwen2.5-VL-7B-Instruct",
script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
delayed_start=0,
......@@ -312,7 +338,7 @@ vllm_configs = {
directory=vllm_dir,
script_name="agg_multimodal.sh",
marks=[
pytest.mark.gpu_2,
pytest.mark.gpu_1,
# https://github.com/ai-dynamo/dynamo/issues/4501
pytest.mark.xfail(strict=False),
],
......@@ -374,7 +400,7 @@ vllm_configs = {
name="multimodal_audio_agg",
directory="/workspace/examples/multimodal",
script_name="audio_agg.sh",
marks=[pytest.mark.gpu_2],
marks=[pytest.mark.gpu_2, pytest.mark.nightly],
model="Qwen/Qwen2-Audio-7B-Instruct",
delayed_start=0,
script_args=["--model", "Qwen/Qwen2-Audio-7B-Instruct"],
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment