Unverified commit 550bf98c, authored by Indrajit Bhosale, committed by GitHub
Browse files

feat: Add security flag to MM flow in vllm (#4556)


Co-authored-by: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com>
parent ef8cf365
...@@ -60,6 +60,7 @@ class Config: ...@@ -60,6 +60,7 @@ class Config:
multimodal_encode_worker: bool = False multimodal_encode_worker: bool = False
multimodal_worker: bool = False multimodal_worker: bool = False
multimodal_decode_worker: bool = False multimodal_decode_worker: bool = False
enable_multimodal: bool = False
multimodal_encode_prefill_worker: bool = False multimodal_encode_prefill_worker: bool = False
mm_prompt_template: str = "USER: <image>\n<prompt> ASSISTANT:" mm_prompt_template: str = "USER: <image>\n<prompt> ASSISTANT:"
# dump config to file # dump config to file
...@@ -159,6 +160,11 @@ def parse_args() -> Config: ...@@ -159,6 +160,11 @@ def parse_args() -> Config:
action="store_true", action="store_true",
help="Run as unified encode+prefill+decode worker for models requiring integrated image encoding (e.g., Llama 4)", help="Run as unified encode+prefill+decode worker for models requiring integrated image encoding (e.g., Llama 4)",
) )
parser.add_argument(
"--enable-multimodal",
action="store_true",
help="Enable multimodal processing. If not set, none of the multimodal components can be used.",
)
parser.add_argument( parser.add_argument(
"--mm-prompt-template", "--mm-prompt-template",
type=str, type=str,
...@@ -224,6 +230,9 @@ def parse_args() -> Config: ...@@ -224,6 +230,9 @@ def parse_args() -> Config:
"Use only one of --multimodal-processor, --multimodal-encode-worker, --multimodal-worker, --multimodal-decode-worker, or --multimodal-encode-prefill-worker" "Use only one of --multimodal-processor, --multimodal-encode-worker, --multimodal-worker, --multimodal-decode-worker, or --multimodal-encode-prefill-worker"
) )
if mm_flags == 1 and not args.enable_multimodal:
raise ValueError("Use --enable-multimodal to enable multimodal processing")
# Set component and endpoint based on worker type # Set component and endpoint based on worker type
if args.multimodal_processor: if args.multimodal_processor:
config.component = "processor" config.component = "processor"
...@@ -262,6 +271,7 @@ def parse_args() -> Config: ...@@ -262,6 +271,7 @@ def parse_args() -> Config:
config.multimodal_worker = args.multimodal_worker config.multimodal_worker = args.multimodal_worker
config.multimodal_decode_worker = args.multimodal_decode_worker config.multimodal_decode_worker = args.multimodal_decode_worker
config.multimodal_encode_prefill_worker = args.multimodal_encode_prefill_worker config.multimodal_encode_prefill_worker = args.multimodal_encode_prefill_worker
config.enable_multimodal = args.enable_multimodal
config.mm_prompt_template = args.mm_prompt_template config.mm_prompt_template = args.mm_prompt_template
config.store_kv = args.store_kv config.store_kv = args.store_kv
config.request_plane = args.request_plane config.request_plane = args.request_plane
......
...@@ -85,6 +85,7 @@ class BaseWorkerHandler(ABC): ...@@ -85,6 +85,7 @@ class BaseWorkerHandler(ABC):
engine, engine,
default_sampling_params, default_sampling_params,
model_max_len: int | None = None, model_max_len: int | None = None,
enable_multimodal: bool = False,
): ):
self.runtime = runtime self.runtime = runtime
self.component = component self.component = component
...@@ -95,6 +96,7 @@ class BaseWorkerHandler(ABC): ...@@ -95,6 +96,7 @@ class BaseWorkerHandler(ABC):
self.image_loader = ImageLoader() self.image_loader = ImageLoader()
self.temp_dirs: list[tempfile.TemporaryDirectory] = [] self.temp_dirs: list[tempfile.TemporaryDirectory] = []
self.model_max_len = model_max_len self.model_max_len = model_max_len
self.enable_multimodal = enable_multimodal
@abstractmethod @abstractmethod
async def generate(self, request, context) -> AsyncGenerator[dict, None]: async def generate(self, request, context) -> AsyncGenerator[dict, None]:
...@@ -159,6 +161,13 @@ class BaseWorkerHandler(ABC): ...@@ -159,6 +161,13 @@ class BaseWorkerHandler(ABC):
if "multi_modal_data" not in request or request["multi_modal_data"] is None: if "multi_modal_data" not in request or request["multi_modal_data"] is None:
return None return None
# Security check: reject multimodal data if not explicitly enabled
if not self.enable_multimodal:
raise ValueError(
"Received multimodal data but multimodal processing is not enabled. "
"Use --enable-multimodal flag to enable multimodal processing."
)
mm_map = request["multi_modal_data"] mm_map = request["multi_modal_data"]
vllm_mm_data = {} vllm_mm_data = {}
...@@ -271,9 +280,15 @@ class DecodeWorkerHandler(BaseWorkerHandler): ...@@ -271,9 +280,15 @@ class DecodeWorkerHandler(BaseWorkerHandler):
engine, engine,
default_sampling_params, default_sampling_params,
model_max_len: int | None = None, model_max_len: int | None = None,
enable_multimodal: bool = False,
): ):
super().__init__( super().__init__(
runtime, component, engine, default_sampling_params, model_max_len runtime,
component,
engine,
default_sampling_params,
model_max_len,
enable_multimodal,
) )
async def generate(self, request, context): async def generate(self, request, context):
...@@ -339,9 +354,15 @@ class PrefillWorkerHandler(BaseWorkerHandler): ...@@ -339,9 +354,15 @@ class PrefillWorkerHandler(BaseWorkerHandler):
engine, engine,
default_sampling_params, default_sampling_params,
model_max_len: int | None = None, model_max_len: int | None = None,
enable_multimodal: bool = False,
): ):
super().__init__( super().__init__(
runtime, component, engine, default_sampling_params, model_max_len runtime,
component,
engine,
default_sampling_params,
model_max_len,
enable_multimodal,
) )
async def generate(self, request, context): async def generate(self, request, context):
......
...@@ -38,7 +38,13 @@ class MultimodalDecodeWorkerHandler(BaseWorkerHandler): ...@@ -38,7 +38,13 @@ class MultimodalDecodeWorkerHandler(BaseWorkerHandler):
) )
# Call BaseWorkerHandler.__init__ with proper parameters # Call BaseWorkerHandler.__init__ with proper parameters
super().__init__(runtime, component, engine_client, default_sampling_params) super().__init__(
runtime,
component,
engine_client,
default_sampling_params,
enable_multimodal=config.enable_multimodal,
)
self.config = config self.config = config
self.enable_disagg = config.is_prefill_worker self.enable_disagg = config.is_prefill_worker
...@@ -98,7 +104,13 @@ class MultimodalPDWorkerHandler(BaseWorkerHandler): ...@@ -98,7 +104,13 @@ class MultimodalPDWorkerHandler(BaseWorkerHandler):
) )
# Call BaseWorkerHandler.__init__ with proper parameters # Call BaseWorkerHandler.__init__ with proper parameters
super().__init__(runtime, component, engine_client, default_sampling_params) super().__init__(
runtime,
component,
engine_client,
default_sampling_params,
enable_multimodal=config.enable_multimodal,
)
self.config = config self.config = config
self.decode_worker_client = decode_worker_client self.decode_worker_client = decode_worker_client
......
...@@ -22,6 +22,10 @@ Dynamo supports multimodal models with vLLM v1. In general, multimodal models ca ...@@ -22,6 +22,10 @@ Dynamo supports multimodal models with vLLM v1. In general, multimodal models ca
> [!WARNING] > [!WARNING]
> **LLaVA Model Limitation**: Do not use LLaVA models (e.g., `llava-hf/llava-1.5-7b-hf`) with the standard aggregated serving setup, as they contain keywords that Dynamo cannot yet parse. LLaVA models can still be used with the EPD (Encode-Prefill-Decode) setup described below. > **LLaVA Model Limitation**: Do not use LLaVA models (e.g., `llava-hf/llava-1.5-7b-hf`) with the standard aggregated serving setup, as they contain keywords that Dynamo cannot yet parse. LLaVA models can still be used with the EPD (Encode-Prefill-Decode) setup described below.
> [!IMPORTANT]
> **Security Requirement**: All multimodal workers require the `--enable-multimodal` flag to be explicitly set at startup. This is a security feature to prevent unintended processing of multimodal data from untrusted sources. Workers will fail at startup if multimodal flags (e.g., `--multimodal-worker`, `--multimodal-processor`) are used without `--enable-multimodal`.
This flag is analogous to `--enable-mm-embeds` in `vllm serve`, but it also covers all multimodal content (URLs, embeddings, and base64-encoded data).
# Multimodal EPD Deployment Examples # Multimodal EPD Deployment Examples
This section provides example workflows and reference implementations for deploying a multimodal model using Dynamo and vLLM v1 with EPD(Encode-Prefill-Decode) pipeline. This section provides example workflows and reference implementations for deploying a multimodal model using Dynamo and vLLM v1 with EPD(Encode-Prefill-Decode) pipeline.
......
...@@ -60,7 +60,7 @@ fi ...@@ -60,7 +60,7 @@ fi
# --enforce-eager: Quick deployment (remove for production) # --enforce-eager: Quick deployment (remove for production)
# --connector none: No KV transfer needed for aggregated serving # --connector none: No KV transfer needed for aggregated serving
DYN_SYSTEM_PORT=8081 \ DYN_SYSTEM_PORT=8081 \
python -m dynamo.vllm --model $MODEL_NAME --enforce-eager --connector none $EXTRA_ARGS python -m dynamo.vllm --enable-multimodal --model $MODEL_NAME --enforce-eager --connector none $EXTRA_ARGS
# Wait for all background processes to complete # Wait for all background processes to complete
wait wait
......
...@@ -75,11 +75,11 @@ elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then ...@@ -75,11 +75,11 @@ elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then
fi fi
# Start processor (Python-based preprocessing, handles prompt templating) # Start processor (Python-based preprocessing, handles prompt templating)
python -m dynamo.vllm --multimodal-processor --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" & python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" &
# run E/P/D workers # run E/P/D workers
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --model $MODEL_NAME & CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME &
CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-worker --model $MODEL_NAME $EXTRA_ARGS & CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS &
# Wait for all background processes to complete # Wait for all background processes to complete
wait wait
...@@ -11,10 +11,10 @@ MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" ...@@ -11,10 +11,10 @@ MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
python -m dynamo.frontend --http-port=8000 & python -m dynamo.frontend --http-port=8000 &
# run processor # run processor
python -m dynamo.vllm --multimodal-processor --model $MODEL_NAME --mm-prompt-template "<|image|>\n<prompt>" & python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "<|image|>\n<prompt>" &
# Llama 4 doesn't support image embedding input, so use encode+prefill worker # Llama 4 doesn't support image embedding input, so use encode+prefill worker
# that handles image encoding inline # that handles image encoding inline
python -m dynamo.vllm --multimodal-encode-prefill-worker --model $MODEL_NAME --tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80 & python -m dynamo.vllm --multimodal-encode-prefill-worker --enable-multimodal --model $MODEL_NAME --tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80 &
# Wait for all background processes to complete # Wait for all background processes to complete
wait wait
...@@ -76,7 +76,7 @@ python -m dynamo.frontend --http-port=8000 & ...@@ -76,7 +76,7 @@ python -m dynamo.frontend --http-port=8000 &
# Start processor # Start processor
echo "Starting processor..." echo "Starting processor..."
python -m dynamo.vllm --multimodal-processor --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" & python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" &
# Configure GPU memory optimization for specific models # Configure GPU memory optimization for specific models
EXTRA_ARGS="" EXTRA_ARGS=""
...@@ -86,17 +86,17 @@ fi ...@@ -86,17 +86,17 @@ fi
# Start encode worker # Start encode worker
echo "Starting encode worker on GPU 1..." echo "Starting encode worker on GPU 1..."
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-encode-worker --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' & VLLM_NIXL_SIDE_CHANNEL_PORT=20097 CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &
# Start prefill worker # Start prefill worker
echo "Starting prefill worker on GPU 2..." echo "Starting prefill worker on GPU 2..."
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
CUDA_VISIBLE_DEVICES=2 python -m dynamo.vllm --multimodal-worker --is-prefill-worker --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' & CUDA_VISIBLE_DEVICES=2 python -m dynamo.vllm --multimodal-worker --is-prefill-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
# Start decode worker # Start decode worker
echo "Starting decode worker on GPU 3..." echo "Starting decode worker on GPU 3..."
VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \
CUDA_VISIBLE_DEVICES=3 python -m dynamo.vllm --multimodal-decode-worker --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}' & CUDA_VISIBLE_DEVICES=3 python -m dynamo.vllm --multimodal-decode-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}' &
echo "==================================================" echo "=================================================="
echo "All components started. Waiting for initialization..." echo "All components started. Waiting for initialization..."
......
...@@ -48,15 +48,15 @@ if [[ $HEAD_NODE -eq 1 ]]; then ...@@ -48,15 +48,15 @@ if [[ $HEAD_NODE -eq 1 ]]; then
python -m dynamo.frontend --http-port=8000 & python -m dynamo.frontend --http-port=8000 &
# run processor # run processor
python -m dynamo.vllm --multimodal-processor --model $MODEL_NAME --mm-prompt-template "<|image|>\n<prompt>" & python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "<|image|>\n<prompt>" &
# Llama 4 doesn't support image embedding input, so the prefill worker will also # Llama 4 doesn't support image embedding input, so the prefill worker will also
# handle image encoding inline. # handle image encoding inline.
# run prefill worker # run prefill worker
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 python -m dynamo.vllm --multimodal-encode-prefill-worker --is-prefill-worker --model $MODEL_NAME --tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80 --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' & VLLM_NIXL_SIDE_CHANNEL_PORT=20097 python -m dynamo.vllm --multimodal-encode-prefill-worker --is-prefill-worker --enable-multimodal --model $MODEL_NAME --tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80 --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &
else else
# run decode worker on non-head node # run decode worker on non-head node
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 python -m dynamo.vllm --multimodal-decode-worker --model $MODEL_NAME --tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80 --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' & VLLM_NIXL_SIDE_CHANNEL_PORT=20098 python -m dynamo.vllm --multimodal-decode-worker --enable-multimodal --model $MODEL_NAME --tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80 --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
fi fi
# Wait for all background processes to complete # Wait for all background processes to complete
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment