help="Run as multimodal processor component for handling multimodal requests",
)
# Boolean role flag (False unless passed on the command line): run this process
# as the ECConnector processor which, per its help text, routes multimodal
# requests to the encoder and then to the PD workers.
parser.add_argument(
"--ec-processor",
action="store_true",
help="Run as ECConnector processor (routes multimodal requests to encoder then PD workers)",
)
parser.add_argument(
"--multimodal-encode-worker",
action="store_true",
...
...
@@ -191,6 +207,34 @@ def parse_args() -> Config:
"'USER: <image> please describe the image ASSISTANT:'."
),
)
# Boolean role flag (False unless passed): run this process as the vLLM-native
# encoder worker. Its help text states that the four --ec-* options declared
# below only take effect when this flag is set.
parser.add_argument(
"--vllm-native-encoder-worker",
action="store_true",
help="Run as vLLM-native encoder worker using ECConnector for encoder disaggregation (requires shared storage). The following flags only work when this flag is enabled: --ec-connector-backend, --ec-storage-path, --ec-extra-config, --ec-consumer-mode.",
)
# Name of the ECConnector implementation class, passed as a string.
# Defaults to "ECExampleConnector", which the help text describes as disk-based.
parser.add_argument(
"--ec-connector-backend",
type=str,
default="ECExampleConnector",
help="ECConnector implementation class for encoder disaggregation. Default: ECExampleConnector (disk-based)",
)
# Storage location handed to the connector. Defaults to None; the help text
# marks it as required for ECExampleConnector and optional for other backends.
parser.add_argument(
"--ec-storage-path",
type=str,
default=None,
help="Storage path for ECConnector (required for ECExampleConnector, optional for other backends)",
)
# Extra connector settings supplied as a JSON string. Stored here as a raw
# string (default None); no JSON parsing happens in this declaration.
parser.add_argument(
"--ec-extra-config",
type=str,
default=None,
help="Additional ECConnector configuration as JSON string",
)
# Boolean flag (False unless passed): configure this process as an ECConnector
# consumer that receives encoder embeddings.
# NOTE(review): the --vllm-native-encoder-worker help text lists this flag as
# only working when that flag is enabled, while the help text here says it is
# for PD workers -- confirm which is intended and align the two help strings.
parser.add_argument(
"--ec-consumer-mode",
action="store_true",
help="Configure as ECConnector consumer for receiving encoder embeddings (for PD workers)",
)
parser.add_argument(
"--store-kv",
type=str,
...
...
@@ -271,27 +315,42 @@ def parse_args() -> Config:
# Check multimodal role exclusivity.
# Each flag below selects a distinct multimodal role for this process, so at
# most one of them may be set. Every flag is normalized to 0/1 and summed to
# count how many roles were requested.
mm_flags = (
    int(bool(args.multimodal_processor))
    + int(bool(args.ec_processor))
    + int(bool(args.multimodal_encode_worker))
    + int(bool(args.multimodal_worker))
    + int(bool(args.multimodal_decode_worker))
    + int(bool(args.multimodal_encode_prefill_worker))
    + int(bool(args.vllm_native_encoder_worker))
)
if mm_flags > 1:
    # Only the current message is kept. The older literal that preceded it
    # omitted --ec-processor and --vllm-native-encoder-worker and, being an
    # adjacent string literal, was concatenated in front of this one.
    raise ValueError(
        "Use only one of --multimodal-processor, --ec-processor, --multimodal-encode-worker, --multimodal-worker, "
        "--multimodal-decode-worker, --multimodal-encode-prefill-worker, or --vllm-native-encoder-worker"
    )
# Selecting any multimodal role requires multimodal support to be switched on.
if mm_flags == 1 and not args.enable_multimodal:
    raise ValueError("Use --enable-multimodal to enable multimodal processing")
# Validate vLLM-native encoder worker config
ifargs.vllm_native_encoder_worker:
if(
args.ec_connector_backend=="ECExampleConnector"
andnotargs.ec_storage_path
):
raiseValueError(
"--ec-storage-path is required when using ECExampleConnector backend. "
"Specify a shared storage path for encoder cache."
> [!NOTE]
> Disaggregation is currently only confirmed to work with LLaVA. Qwen2.5-VL is not confirmed to be supported.
## ECConnector Serving
ECConnector is vLLM's native connector for transferring multimodal embeddings via an Embedding Cache. The encoder worker acts as a **producer** (writes embeddings), while the PD worker acts as a **consumer** (reads embeddings).
**Client:** Same as [E/PD Serving](#epd-serving-encode-separate)
## Llama 4 Serving
The Llama 4 model family is natively multimodal. Unlike LLaVA, Llama 4 models do not directly consume image embeddings as input (see the [vLLM support matrix](https://docs.vllm.ai/en/latest/models/supported_models.html#text-generation_1)). Therefore, the encoder worker is not used and encoding is done alongside prefill.
PROMPT_TEMPLATE="<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|><prompt><|im_end|>\n<|im_start|>assistant\n"
else
echo"No multi-modal prompt template is defined for the model: $MODEL_NAME"
echo"Please provide a prompt template using --prompt-template option."