help="Run as multimodal worker component for LLM inference with multimodal data",
)
parser.add_argument(
"--multimodal-decode-worker",
action="store_true",
help="Run as multimodal decode worker in disaggregated mode",
)
parser.add_argument(
"--multimodal-encode-prefill-worker",
action="store_true",
...
...
@@ -201,11 +207,12 @@ def parse_args() -> Config:
int(bool(args.multimodal_processor))
+int(bool(args.multimodal_encode_worker))
+int(bool(args.multimodal_worker))
+int(bool(args.multimodal_decode_worker))
+int(bool(args.multimodal_encode_prefill_worker))
)
ifmm_flags>1:
raiseValueError(
"Use only one of --multimodal-processor, --multimodal-encode-worker, --multimodal-worker, or --multimodal-encode-prefill-worker"
"Use only one of --multimodal-processor, --multimodal-encode-worker, --multimodal-worker, --multimodal-decode-worker, or --multimodal-encode-prefill-worker"
)
# Set component and endpoint based on worker type
...
...
@@ -218,8 +225,14 @@ def parse_args() -> Config:
elifargs.multimodal_encode_prefill_worker:
config.component="encoder"
config.endpoint="generate"
elifargs.multimodal_decode_worker:
# Uses "decoder" component name because prefill worker connects to "decoder"
# (prefill uses "backend" to receive from encoder)
echo"Disaggregated multimodal serving with separate Encode/Prefill/Decode workers"
echo""
echo"Options:"
echo" --model <model_name> Specify the model to use (default: $MODEL_NAME)"
echo" --prompt-template <template> Specify the multi-modal prompt template to use. LLaVA 1.5 7B, Qwen2.5-VL, and Phi3V models have predefined templates."
echo" -h, --help Show this help message"
echo" --model <model_name> Specify the VLM model to use (default: $MODEL_NAME)"
echo" --prompt-template <template> Specify the multi-modal prompt template to use"
echo" LLaVA 1.5 7B, Qwen2.5-VL, and Phi3V models have predefined templates"