Unverified commit fb1b4f92, authored by Kris Hung and committed by GitHub
Browse files

chore: Add vllm multimodal tests to pre_merge (#4889)

parent e83847c0
...@@ -143,6 +143,8 @@ jobs: ...@@ -143,6 +143,8 @@ jobs:
echo ${K8S_NODE_NAME} echo ${K8S_NODE_NAME}
- name: Checkout code - name: Checkout code
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
with:
lfs: true
- name: Docker Login - name: Docker Login
uses: ./.github/actions/docker-login uses: ./.github/actions/docker-login
with: with:
......
...@@ -18,7 +18,7 @@ from typing import Any, Dict, Optional ...@@ -18,7 +18,7 @@ from typing import Any, Dict, Optional
import torch import torch
from .model import SupportedModels, is_model_supported from .model import SupportedModels, is_model_supported, is_qwen_vl_model
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -88,7 +88,7 @@ def encode_image_embeddings( ...@@ -88,7 +88,7 @@ def encode_image_embeddings(
embeddings = projector(vision_outputs.last_hidden_state) embeddings = projector(vision_outputs.last_hidden_state)
elif is_model_supported(model_name, SupportedModels.QWEN_2_5_VL_7B): elif is_qwen_vl_model(model_name):
embeddings = get_qwen_image_features(vision_encoder, image_embeds) embeddings = get_qwen_image_features(vision_encoder, image_embeds)
else: else:
...@@ -123,7 +123,7 @@ def get_encoder_components( ...@@ -123,7 +123,7 @@ def get_encoder_components(
projector = getattr(vision_model, "multi_modal_projector", None) projector = getattr(vision_model, "multi_modal_projector", None)
return vision_encoder, projector return vision_encoder, projector
elif is_model_supported(model_name, SupportedModels.QWEN_2_5_VL_7B): elif is_qwen_vl_model(model_name):
vision_encoder = vision_model vision_encoder = vision_model
projector = None projector = None
return vision_encoder, projector return vision_encoder, projector
......
...@@ -27,6 +27,7 @@ class SupportedModels: ...@@ -27,6 +27,7 @@ class SupportedModels:
"""Supported multimodal model identifiers""" """Supported multimodal model identifiers"""
LLAVA_1_5_7B = "llava-hf/llava-1.5-7b-hf" LLAVA_1_5_7B = "llava-hf/llava-1.5-7b-hf"
QWEN_2_VL_2B = "Qwen/Qwen2-VL-2B-Instruct"
QWEN_2_5_VL_7B = "Qwen/Qwen2.5-VL-7B-Instruct" QWEN_2_5_VL_7B = "Qwen/Qwen2.5-VL-7B-Instruct"
LLAVA_NEXT_VIDEO_7B = "llava-hf/LLaVA-NeXT-Video-7B-hf" LLAVA_NEXT_VIDEO_7B = "llava-hf/LLaVA-NeXT-Video-7B-hf"
...@@ -100,6 +101,28 @@ def is_model_supported(model_name: str, supported_model: str) -> bool: ...@@ -100,6 +101,28 @@ def is_model_supported(model_name: str, supported_model: str) -> bool:
return normalized_name == normalized_supported return normalized_name == normalized_supported
# All Qwen VL model variants, expressed as SupportedModels identifiers.
# is_qwen_vl_model() checks a model name against every entry in this list,
# so adding a new Qwen VL variant here extends that check to it.
QWEN_VL_MODELS = [
SupportedModels.QWEN_2_VL_2B,
SupportedModels.QWEN_2_5_VL_7B,
]
def is_qwen_vl_model(model_name: str) -> bool:
    """Report whether ``model_name`` matches any known Qwen VL variant.

    The name is compared against each entry of ``QWEN_VL_MODELS`` using
    ``is_model_supported``; the first match ends the search.

    Args:
        model_name: The model name to check.

    Returns:
        True if the model matches a Qwen VL variant, False otherwise.
    """
    for candidate in QWEN_VL_MODELS:
        if is_model_supported(model_name, candidate):
            return True
    return False
def load_vision_model(model_id: str) -> torch.nn.Module: def load_vision_model(model_id: str) -> torch.nn.Module:
""" """
Load a vision model from a HuggingFace model ID. Load a vision model from a HuggingFace model ID.
...@@ -132,7 +155,7 @@ def construct_mm_data( ...@@ -132,7 +155,7 @@ def construct_mm_data(
image_embeds = image_embeds.to(embeddings_dtype) image_embeds = image_embeds.to(embeddings_dtype)
# Model-specific image handling # Model-specific image handling
if is_model_supported(model, SupportedModels.QWEN_2_5_VL_7B): if is_qwen_vl_model(model):
return _construct_qwen_image_data(image_embeds, image_grid_thw) return _construct_qwen_image_data(image_embeds, image_grid_thw)
else: else:
# Default image handling for other models (e.g., LLAVA_1_5_7B) # Default image handling for other models (e.g., LLAVA_1_5_7B)
......
...@@ -19,6 +19,7 @@ trap 'echo Cleaning up...; kill 0' EXIT ...@@ -19,6 +19,7 @@ trap 'echo Cleaning up...; kill 0' EXIT
MODEL_NAME="llava-hf/llava-1.5-7b-hf" MODEL_NAME="llava-hf/llava-1.5-7b-hf"
PROMPT_TEMPLATE="USER: <image>\n<prompt> ASSISTANT:" PROMPT_TEMPLATE="USER: <image>\n<prompt> ASSISTANT:"
PROVIDED_PROMPT_TEMPLATE="" PROVIDED_PROMPT_TEMPLATE=""
SINGLE_GPU=false
# Parse command line arguments # Parse command line arguments
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
...@@ -31,11 +32,16 @@ while [[ $# -gt 0 ]]; do ...@@ -31,11 +32,16 @@ while [[ $# -gt 0 ]]; do
PROVIDED_PROMPT_TEMPLATE=$2 PROVIDED_PROMPT_TEMPLATE=$2
shift 2 shift 2
;; ;;
--single-gpu)
SINGLE_GPU=true
shift
;;
-h|--help) -h|--help)
echo "Usage: $0 [OPTIONS]" echo "Usage: $0 [OPTIONS]"
echo "Options:" echo "Options:"
echo " --model <model_name> Specify the model to use (default: $MODEL_NAME)" echo " --model <model_name> Specify the model to use (default: $MODEL_NAME)"
echo " --prompt-template <template> Specify the multi-modal prompt template to use. LLaVA 1.5 7B, Qwen2.5-VL, and Phi3V models have predefined templates." echo " --prompt-template <template> Specify the multi-modal prompt template to use. LLaVA 1.5 7B, Qwen2.5-VL, and Phi3V models have predefined templates."
echo " --single-gpu Run both encode and PD workers on GPU 0 (for pre-merge CI)"
echo " -h, --help Show this help message" echo " -h, --help Show this help message"
exit 0 exit 0
;; ;;
...@@ -54,7 +60,7 @@ elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then ...@@ -54,7 +60,7 @@ elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then
PROMPT_TEMPLATE="USER: <image>\n<prompt> ASSISTANT:" PROMPT_TEMPLATE="USER: <image>\n<prompt> ASSISTANT:"
elif [[ "$MODEL_NAME" == "microsoft/Phi-3.5-vision-instruct" ]]; then elif [[ "$MODEL_NAME" == "microsoft/Phi-3.5-vision-instruct" ]]; then
PROMPT_TEMPLATE="<|user|>\n<|image_1|>\n<prompt><|end|>\n<|assistant|>\n" PROMPT_TEMPLATE="<|user|>\n<|image_1|>\n<prompt><|end|>\n<|assistant|>\n"
elif [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then elif [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]] || [[ "$MODEL_NAME" == "Qwen/Qwen2-VL-2B-Instruct" ]]; then
PROMPT_TEMPLATE="<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|><prompt><|im_end|>\n<|im_start|>assistant\n" PROMPT_TEMPLATE="<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|><prompt><|im_end|>\n<|im_start|>assistant\n"
else else
echo "No multi-modal prompt template is defined for the model: $MODEL_NAME" echo "No multi-modal prompt template is defined for the model: $MODEL_NAME"
...@@ -67,11 +73,14 @@ fi ...@@ -67,11 +73,14 @@ fi
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend & python -m dynamo.frontend &
# To make Qwen2.5-VL fit in A100 40GB, set the following extra arguments # Set GPU memory utilization and model length based on deployment mode
# Single-GPU mode: Both workers share GPU 0, so use reduced memory settings
# Multi-GPU mode: Each worker gets its own GPU, so use higher memory settings
EXTRA_ARGS="" EXTRA_ARGS=""
if [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then if [[ "$SINGLE_GPU" == "true" ]]; then
EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 4096" EXTRA_ARGS="--gpu-memory-utilization 0.3 --max-model-len 3072 --enforce-eager"
elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then else
# Multi-GPU mode: standard memory settings
EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 4096" EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 4096"
fi fi
...@@ -79,8 +88,15 @@ fi ...@@ -79,8 +88,15 @@ fi
python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" & python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" &
# run E/P/D workers # run E/P/D workers
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME & # Use single GPU (GPU 0) for pre-merge CI, otherwise use GPU 0 for encode and GPU 1 for PD
CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME $EXTRA_ARGS & if [[ "$SINGLE_GPU" == "true" ]]; then
# Single GPU mode: both workers share GPU 0 with reduced memory
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS &
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME $EXTRA_ARGS &
else
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS &
CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME $EXTRA_ARGS &
fi
# Wait for all background processes to complete # Wait for all background processes to complete
wait wait
...@@ -79,7 +79,6 @@ python -m dynamo.frontend & ...@@ -79,7 +79,6 @@ python -m dynamo.frontend &
echo "Starting processor..." echo "Starting processor..."
python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" & python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" &
# Configure GPU memory optimization for specific models
EXTRA_ARGS="" EXTRA_ARGS=""
# Start encode worker # Start encode worker
......
...@@ -227,6 +227,32 @@ vllm_configs = { ...@@ -227,6 +227,32 @@ vllm_configs = {
completion_payload_default(), completion_payload_default(),
], ],
), ),
"multimodal_agg_qwen2vl_2b_epd": VLLMConfig(
name="multimodal_agg_qwen2vl_2b_epd",
directory=vllm_dir,
script_name="agg_multimodal_epd.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
model="Qwen/Qwen2-VL-2B-Instruct",
script_args=["--model", "Qwen/Qwen2-VL-2B-Instruct", "--single-gpu"],
request_payloads=[
chat_payload(
[
{
"type": "text",
"text": "What colors are in the following image? Respond only with the colors.",
},
{
"type": "image_url",
"image_url": {"url": MULTIMODAL_IMG_URL},
},
],
repeat_count=1,
expected_response=["purple"],
temperature=0.0,
max_tokens=100,
)
],
),
"multimodal_agg_llava_epd": VLLMConfig( "multimodal_agg_llava_epd": VLLMConfig(
name="multimodal_agg_llava_epd", name="multimodal_agg_llava_epd",
directory=vllm_dir, directory=vllm_dir,
...@@ -284,7 +310,7 @@ vllm_configs = { ...@@ -284,7 +310,7 @@ vllm_configs = {
name="multimodal_agg_qwen", name="multimodal_agg_qwen",
directory=vllm_dir, directory=vllm_dir,
script_name="agg_multimodal.sh", script_name="agg_multimodal.sh",
marks=[pytest.mark.gpu_2, pytest.mark.nightly], marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
model="Qwen/Qwen2.5-VL-7B-Instruct", model="Qwen/Qwen2.5-VL-7B-Instruct",
script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"], script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
delayed_start=0, delayed_start=0,
...@@ -312,7 +338,7 @@ vllm_configs = { ...@@ -312,7 +338,7 @@ vllm_configs = {
directory=vllm_dir, directory=vllm_dir,
script_name="agg_multimodal.sh", script_name="agg_multimodal.sh",
marks=[ marks=[
pytest.mark.gpu_2, pytest.mark.gpu_1,
# https://github.com/ai-dynamo/dynamo/issues/4501 # https://github.com/ai-dynamo/dynamo/issues/4501
pytest.mark.xfail(strict=False), pytest.mark.xfail(strict=False),
], ],
...@@ -374,7 +400,7 @@ vllm_configs = { ...@@ -374,7 +400,7 @@ vllm_configs = {
name="multimodal_audio_agg", name="multimodal_audio_agg",
directory="/workspace/examples/multimodal", directory="/workspace/examples/multimodal",
script_name="audio_agg.sh", script_name="audio_agg.sh",
marks=[pytest.mark.gpu_2], marks=[pytest.mark.gpu_2, pytest.mark.nightly],
model="Qwen/Qwen2-Audio-7B-Instruct", model="Qwen/Qwen2-Audio-7B-Instruct",
delayed_start=0, delayed_start=0,
script_args=["--model", "Qwen/Qwen2-Audio-7B-Instruct"], script_args=["--model", "Qwen/Qwen2-Audio-7B-Instruct"],
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment