Unverified Commit 550bf98c authored by Indrajit Bhosale's avatar Indrajit Bhosale Committed by GitHub
Browse files

feat: Add security flag to MM flow in vllm (#4556)


Co-authored-by: default avatarKrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com>
parent ef8cf365
......@@ -60,6 +60,7 @@ class Config:
multimodal_encode_worker: bool = False
multimodal_worker: bool = False
multimodal_decode_worker: bool = False
enable_multimodal: bool = False
multimodal_encode_prefill_worker: bool = False
mm_prompt_template: str = "USER: <image>\n<prompt> ASSISTANT:"
# dump config to file
......@@ -159,6 +160,11 @@ def parse_args() -> Config:
action="store_true",
help="Run as unified encode+prefill+decode worker for models requiring integrated image encoding (e.g., Llama 4)",
)
parser.add_argument(
"--enable-multimodal",
action="store_true",
help="Enable multimodal processing. If not set, none of the multimodal components can be used.",
)
parser.add_argument(
"--mm-prompt-template",
type=str,
......@@ -224,6 +230,9 @@ def parse_args() -> Config:
"Use only one of --multimodal-processor, --multimodal-encode-worker, --multimodal-worker, --multimodal-decode-worker, or --multimodal-encode-prefill-worker"
)
if mm_flags == 1 and not args.enable_multimodal:
raise ValueError("Use --enable-multimodal to enable multimodal processing")
# Set component and endpoint based on worker type
if args.multimodal_processor:
config.component = "processor"
......@@ -262,6 +271,7 @@ def parse_args() -> Config:
config.multimodal_worker = args.multimodal_worker
config.multimodal_decode_worker = args.multimodal_decode_worker
config.multimodal_encode_prefill_worker = args.multimodal_encode_prefill_worker
config.enable_multimodal = args.enable_multimodal
config.mm_prompt_template = args.mm_prompt_template
config.store_kv = args.store_kv
config.request_plane = args.request_plane
......
......@@ -85,6 +85,7 @@ class BaseWorkerHandler(ABC):
engine,
default_sampling_params,
model_max_len: int | None = None,
enable_multimodal: bool = False,
):
self.runtime = runtime
self.component = component
......@@ -95,6 +96,7 @@ class BaseWorkerHandler(ABC):
self.image_loader = ImageLoader()
self.temp_dirs: list[tempfile.TemporaryDirectory] = []
self.model_max_len = model_max_len
self.enable_multimodal = enable_multimodal
@abstractmethod
async def generate(self, request, context) -> AsyncGenerator[dict, None]:
......@@ -159,6 +161,13 @@ class BaseWorkerHandler(ABC):
if "multi_modal_data" not in request or request["multi_modal_data"] is None:
return None
# Security check: reject multimodal data if not explicitly enabled
if not self.enable_multimodal:
raise ValueError(
"Received multimodal data but multimodal processing is not enabled. "
"Use --enable-multimodal flag to enable multimodal processing."
)
mm_map = request["multi_modal_data"]
vllm_mm_data = {}
......@@ -271,9 +280,15 @@ class DecodeWorkerHandler(BaseWorkerHandler):
engine,
default_sampling_params,
model_max_len: int | None = None,
enable_multimodal: bool = False,
):
super().__init__(
runtime, component, engine, default_sampling_params, model_max_len
runtime,
component,
engine,
default_sampling_params,
model_max_len,
enable_multimodal,
)
async def generate(self, request, context):
......@@ -339,9 +354,15 @@ class PrefillWorkerHandler(BaseWorkerHandler):
engine,
default_sampling_params,
model_max_len: int | None = None,
enable_multimodal: bool = False,
):
super().__init__(
runtime, component, engine, default_sampling_params, model_max_len
runtime,
component,
engine,
default_sampling_params,
model_max_len,
enable_multimodal,
)
async def generate(self, request, context):
......
......@@ -38,7 +38,13 @@ class MultimodalDecodeWorkerHandler(BaseWorkerHandler):
)
# Call BaseWorkerHandler.__init__ with proper parameters
super().__init__(runtime, component, engine_client, default_sampling_params)
super().__init__(
runtime,
component,
engine_client,
default_sampling_params,
enable_multimodal=config.enable_multimodal,
)
self.config = config
self.enable_disagg = config.is_prefill_worker
......@@ -98,7 +104,13 @@ class MultimodalPDWorkerHandler(BaseWorkerHandler):
)
# Call BaseWorkerHandler.__init__ with proper parameters
super().__init__(runtime, component, engine_client, default_sampling_params)
super().__init__(
runtime,
component,
engine_client,
default_sampling_params,
enable_multimodal=config.enable_multimodal,
)
self.config = config
self.decode_worker_client = decode_worker_client
......
......@@ -22,6 +22,10 @@ Dynamo supports multimodal models with vLLM v1. In general, multimodal models ca
> [!WARNING]
> **LLaVA Model Limitation**: Do not use LLaVA models (e.g., `llava-hf/llava-1.5-7b-hf`) with the standard aggregated serving setup, as they contain keywords that Dynamo cannot yet parse. LLaVA models can still be used with the EPD (Encode-Prefill-Decode) setup described below.
> [!IMPORTANT]
> **Security Requirement**: All multimodal workers require the `--enable-multimodal` flag to be explicitly set at startup. This is a security feature to prevent unintended processing of multimodal data from untrusted sources. Workers will fail at startup if multimodal flags (e.g., `--multimodal-worker`, `--multimodal-processor`) are used without `--enable-multimodal`.
This flag is analogous to `--enable-mm-embeds` in `vllm serve`, but it extends the same opt-in requirement to all multimodal content (URLs, embeddings, and base64-encoded data).
# Multimodal EPD Deployment Examples
This section provides example workflows and reference implementations for deploying a multimodal model using Dynamo and vLLM v1 with the EPD (Encode-Prefill-Decode) pipeline.
......
......@@ -60,7 +60,7 @@ fi
# --enforce-eager: Quick deployment (remove for production)
# --connector none: No KV transfer needed for aggregated serving
DYN_SYSTEM_PORT=8081 \
python -m dynamo.vllm --model $MODEL_NAME --enforce-eager --connector none $EXTRA_ARGS
python -m dynamo.vllm --enable-multimodal --model $MODEL_NAME --enforce-eager --connector none $EXTRA_ARGS
# Wait for all background processes to complete
wait
......
......@@ -75,11 +75,11 @@ elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then
fi
# Start processor (Python-based preprocessing, handles prompt templating)
python -m dynamo.vllm --multimodal-processor --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" &
python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" &
# run E/P/D workers
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --model $MODEL_NAME &
CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-worker --model $MODEL_NAME $EXTRA_ARGS &
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME &
CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS &
# Wait for all background processes to complete
wait
......@@ -11,10 +11,10 @@ MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
python -m dynamo.frontend --http-port=8000 &
# run processor
python -m dynamo.vllm --multimodal-processor --model $MODEL_NAME --mm-prompt-template "<|image|>\n<prompt>" &
python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "<|image|>\n<prompt>" &
# Llama 4 doesn't support image embedding input, so use encode+prefill worker
# that handles image encoding inline
python -m dynamo.vllm --multimodal-encode-prefill-worker --model $MODEL_NAME --tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80 &
python -m dynamo.vllm --multimodal-encode-prefill-worker --enable-multimodal --model $MODEL_NAME --tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80 &
# Wait for all background processes to complete
wait
......@@ -76,7 +76,7 @@ python -m dynamo.frontend --http-port=8000 &
# Start processor
echo "Starting processor..."
python -m dynamo.vllm --multimodal-processor --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" &
python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" &
# Configure GPU memory optimization for specific models
EXTRA_ARGS=""
......@@ -86,17 +86,17 @@ fi
# Start encode worker
echo "Starting encode worker on GPU 1..."
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-encode-worker --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &
# Start prefill worker
echo "Starting prefill worker on GPU 2..."
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
CUDA_VISIBLE_DEVICES=2 python -m dynamo.vllm --multimodal-worker --is-prefill-worker --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
CUDA_VISIBLE_DEVICES=2 python -m dynamo.vllm --multimodal-worker --is-prefill-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
# Start decode worker
echo "Starting decode worker on GPU 3..."
VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \
CUDA_VISIBLE_DEVICES=3 python -m dynamo.vllm --multimodal-decode-worker --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}' &
CUDA_VISIBLE_DEVICES=3 python -m dynamo.vllm --multimodal-decode-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}' &
echo "=================================================="
echo "All components started. Waiting for initialization..."
......
......@@ -48,15 +48,15 @@ if [[ $HEAD_NODE -eq 1 ]]; then
python -m dynamo.frontend --http-port=8000 &
# run processor
python -m dynamo.vllm --multimodal-processor --model $MODEL_NAME --mm-prompt-template "<|image|>\n<prompt>" &
python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "<|image|>\n<prompt>" &
# Llama 4 doesn't support image embedding input, so the prefill worker will also
# handle image encoding inline.
# run prefill worker
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 python -m dynamo.vllm --multimodal-encode-prefill-worker --is-prefill-worker --model $MODEL_NAME --tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80 --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 python -m dynamo.vllm --multimodal-encode-prefill-worker --is-prefill-worker --enable-multimodal --model $MODEL_NAME --tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80 --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &
else
# run decode worker on non-head node
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 python -m dynamo.vllm --multimodal-decode-worker --model $MODEL_NAME --tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80 --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 python -m dynamo.vllm --multimodal-decode-worker --enable-multimodal --model $MODEL_NAME --tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80 --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
fi
# Wait for all background processes to complete
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment