# Print the help descriptions for the --model, --prompt-template and
# --single-gpu options. $MODEL_NAME is expanded when the text is printed, so
# the help shows the value currently in effect as the default.
# NOTE: a space is required between `echo` and the opening quote; without it
# the shell looks up a command whose name is the whole string and fails with
# "command not found".
echo " --model <model_name> Specify the model to use (default: $MODEL_NAME)"
echo " --prompt-template <template> Specify the multi-modal prompt template to use. LLaVA 1.5 7B, Qwen2.5-VL, and Phi3V models have predefined templates."
echo " --single-gpu Run both encode and PD workers on GPU 0 (for pre-merge CI)"
PROMPT_TEMPLATE="<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|><prompt><|im_end|>\n<|im_start|>assistant\n"
else
# Report that no predefined multi-modal prompt template exists for the
# selected model. A space must separate `echo` from the quoted message,
# otherwise the shell treats the whole string as a command name.
echo "No multi-modal prompt template is defined for the model: $MODEL_NAME"
...
...
@@ -67,11 +73,14 @@ fi
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
# No port is passed here, so the frontend uses DYN_HTTP_PORT if set, else the default.
# The trailing `&` runs it in the background so the rest of this script keeps executing.
python -m dynamo.frontend &
# To make Qwen2.5-VL fit in A100 40GB, set the following extra arguments
# Set GPU memory utilization and model length based on deployment mode
# Single-GPU mode: Both workers share GPU 0, so use reduced memory settings
# Multi-GPU mode: Each worker gets its own GPU, so use higher memory settings