Unverified commit 842f0f15, authored by Indrajit Bhosale and committed by GitHub
Browse files

fix: VLLM Multimodal minor fixes (#5748)


Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>
parent c5e30afb
...@@ -50,15 +50,25 @@ done ...@@ -50,15 +50,25 @@ done
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend & python -m dynamo.frontend &
# Set max model length based on model name
MAX_MODEL_LEN=""
if [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
MAX_MODEL_LEN="4096"
elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then
MAX_MODEL_LEN="2048"
else
MAX_MODEL_LEN="30426"
fi
# Set GPU memory utilization and model length based on deployment mode # Set GPU memory utilization and model length based on deployment mode
# Single-GPU mode: Both workers share GPU 0, so use reduced memory settings # Single-GPU mode: Both workers share GPU 0, so use reduced memory settings
# Multi-GPU mode: Each worker gets its own GPU, so use higher memory settings # Multi-GPU mode: Each worker gets its own GPU, so use higher memory settings
EXTRA_ARGS="" EXTRA_ARGS=""
if [[ "$SINGLE_GPU" == "true" ]]; then if [[ "$SINGLE_GPU" == "true" ]]; then
EXTRA_ARGS="--gpu-memory-utilization 0.4 --enforce-eager --max-model-len 30426" EXTRA_ARGS="--gpu-memory-utilization 0.4 --enforce-eager --max-model-len $MAX_MODEL_LEN"
else else
# Multi-GPU mode: standard memory settings # Multi-GPU mode: standard memory settings
EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 30426" EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len $MAX_MODEL_LEN"
fi fi
# Start processor (Python-based preprocessing, handles prompt templating) # Start processor (Python-based preprocessing, handles prompt templating)
......
...@@ -153,7 +153,8 @@ def overwrite_args(config): ...@@ -153,7 +153,8 @@ def overwrite_args(config):
dp_rank = config.engine_args.data_parallel_rank or 0 dp_rank = config.engine_args.data_parallel_rank or 0
defaults = { defaults = {
"task": "generate", # vLLM 0.13+ renamed 'task' to 'runner'
"runner": "generate",
"skip_tokenizer_init": False, "skip_tokenizer_init": False,
"enable_log_requests": False, "enable_log_requests": False,
"enable_prefix_caching": True, "enable_prefix_caching": True,
...@@ -178,4 +179,6 @@ def overwrite_args(config): ...@@ -178,4 +179,6 @@ def overwrite_args(config):
setattr(config.engine_args, key, value) setattr(config.engine_args, key, value)
logger.debug(f" engine_args.{key} = {value}") logger.debug(f" engine_args.{key} = {value}")
else: else:
raise ValueError(f"{key} not found in AsyncEngineArgs from vLLM.") logger.debug(
f" Skipping engine_args.{key} (not available in this vLLM version)"
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment