help="Which key-value backend to use: etcd, mem, file. Etcd uses the ETCD_* env vars (e.g. ETCD_ENDPOINTS) for connection details. File uses root dir from env var DYN_FILE_KV or defaults to $TMPDIR/dynamo_store_kv.",
choices=["etcd","file","mem"],
)
# Request plane: one of tcp / nats / http, defaulting to tcp.
add_argument(
    g,
    flag_name="--request-plane",
    env_var="DYN_REQUEST_PLANE",
    help="Determines how requests are distributed from routers to workers. 'tcp' is fastest.",
    default="tcp",
    choices=["tcp", "nats", "http"],
)

# Event plane: one of nats / zmq, defaulting to nats.
add_argument(
    g,
    flag_name="--event-plane",
    env_var="DYN_EVENT_PLANE",
    help="Determines how events are published.",
    default="nats",
    choices=["nats", "zmq"],
)

# Connectors: accepts zero or more values (nargs="*"); the default is ["nixl"].
add_argument(
    g,
    flag_name="--connector",
    env_var="DYN_CONNECTOR",
    help="List of connectors to use in order (e.g., --connector nixl lmcache). Options: nixl, lmcache, kvbm, null, none. Order will be preserved in MultiConnector.",
    default=["nixl"],
    nargs="*",
)

# Durable KV events: negatable boolean, off by default.
add_negatable_bool_argument(
    g,
    flag_name="--durable-kv-events",
    env_var="DYN_DURABLE_KV_EVENTS",
    help="Enable durable KV events using NATS JetStream instead of the local indexer. By default, local indexer is enabled for lower latency. Use this flag when you need durability and multi-replica router consistency. Requires NATS with JetStream enabled. Can also be set via DYN_DURABLE_KV_EVENTS=true env var.",
    default=False,
)
# Optional tool-call / reasoning parsers. The allowed values come from
# get_tool_parser_names() and get_reasoning_parser_names().
# Dynamo-specific flags carry a "dyn-" prefix so they do not collide with
# backend flags of the same name.
add_argument(
    g,
    flag_name="--dyn-tool-call-parser",
    env_var="DYN_TOOL_CALL_PARSER",
    help="Tool call parser name for the model.",
    default=None,
    choices=get_tool_parser_names(),
)
add_argument(
    g,
    flag_name="--dyn-reasoning-parser",
    env_var="DYN_REASONING_PARSER",
    help="Reasoning parser name for the model. If not specified, no reasoning parsing is performed.",
    default=None,
    choices=get_reasoning_parser_names(),
)

# Chat-template override; unset (None) by default.
add_argument(
    g,
    flag_name="--custom-jinja-template",
    env_var="DYN_CUSTOM_JINJA_TEMPLATE",
    help="Path to a custom Jinja template file to override the model's default chat template. This template will take precedence over any template found in the model repository.",
    default=None,
)

# Enabled endpoint types as a comma-separated string; the previous spelling
# of the flag is passed as obsolete_flag.
add_argument(
    g,
    flag_name="--endpoint-types",
    env_var="DYN_ENDPOINT_TYPES",
    help="Comma-separated list of endpoint types to enable. Options: 'chat', 'completions'. Use 'completions' for models without chat templates.",
    default="chat,completions",
    obsolete_flag="--dyn-endpoint-types",
)
# Optional path to which the resolved configuration is dumped; unset (None)
# by default. A call may carry only one help= keyword (a repeated keyword
# argument is a SyntaxError), so only the text describing this flag is passed.
add_argument(
    g,
    flag_name="--dump-config-to",
    env_var="DYN_DUMP_CONFIG_TO",
    default=None,
    help="Dump resolved configuration to the specified file path.",
)
# Decode-worker marker: plain store_true switch.
parser.add_argument(
    "--is-decode-worker",
    help="Mark this as a decode worker which does not publish KV events.",
    action="store_true",
)

# Connector list: zero or more values, default ["nixl"].
parser.add_argument(
    "--connector",
    help=(
        "List of connectors to use in order (e.g., --connector nixl lmcache). "
        "Options: nixl, lmcache, kvbm, null, none. Default: nixl. Order will be preserved in MultiConnector."
    ),
    default=["nixl"],
    nargs="*",
)

# Dynamo-specific flags use a "dyn-" prefix to avoid name clashes with
# flags of the different backends.
parser.add_argument(
    "--dyn-tool-call-parser",
    help="Tool call parser name for the model.",
    choices=get_tool_parser_names(),
    default=None,
    type=str,
)
parser.add_argument(
    "--dyn-reasoning-parser",
    help="Reasoning parser name for the model. If not specified, no reasoning parsing is performed.",
    choices=get_reasoning_parser_names(),
    default=None,
    type=str,
)

# Chat-template override path; None when not given.
parser.add_argument(
    "--custom-jinja-template",
    help="Path to a custom Jinja template file to override the model's default chat template. This template will take precedence over any template found in the model repository.",
    default=None,
    type=str,
)

# Enabled endpoint types as one comma-separated string.
parser.add_argument(
    "--dyn-endpoint-types",
    help="Comma-separated list of endpoint types to enable. Options: 'chat', 'completions'. Default: 'chat,completions'. Use 'completions' for models without chat templates.",
    default="chat,completions",
    type=str,
)
# Multimodal component-role switches. Each one is an independent store_true
# flag (False unless given on the command line).
parser.add_argument(
    "--multimodal-processor",
    help="Run as multimodal processor component for handling multimodal requests",
    action="store_true",
)
parser.add_argument(
    "--ec-processor",
    help="Run as ECConnector processor (routes multimodal requests to encoder then PD workers)",
    action="store_true",
)
parser.add_argument(
    "--multimodal-encode-worker",
    help="Run as multimodal encode worker component for processing images/videos",
    action="store_true",
)
parser.add_argument(
    "--multimodal-worker",
    help="Run as multimodal worker component for LLM inference with multimodal data",
    action="store_true",
)
parser.add_argument(
    "--multimodal-decode-worker",
    help="Run as multimodal decode worker in disaggregated mode",
    action="store_true",
)
parser.add_argument(
    "--multimodal-encode-prefill-worker",
    help="Run as unified encode+prefill+decode worker for models requiring integrated image encoding (e.g., Llama 4)",
    action="store_true",
)

# Switch described by its help text as the gate for all multimodal components.
parser.add_argument(
    "--enable-multimodal",
    help="Enable multimodal processing. If not set, none of the multimodal components can be used.",
    action="store_true",
)
# Prompt template for multimodal requests; the help text explains how
# '<prompt>' is substituted.
parser.add_argument(
    "--mm-prompt-template",
    help="Different multi-modal models expect the prompt to contain different special media prompts. "
    "The processor will use this argument to construct the final prompt. "
    "User prompt will replace '<prompt>' in the provided template. "
    "For example, if the user prompt is 'please describe the image' and the prompt template is "
    "'USER: <image> <prompt> ASSISTANT:', the resulting prompt is "
    "'USER: <image> please describe the image ASSISTANT:'.",
    default="USER: <image>\n<prompt> ASSISTANT:",
    type=str,
)

# Frontend image decoding: store_true switch.
parser.add_argument(
    "--frontend-decoding",
    help="Enable frontend decoding of multimodal images. "
    "When enabled, images are decoded in the Rust frontend and transferred to the backend via NIXL RDMA. "
    "Without this flag, images are decoded in the Python backend (default behavior).",
    action="store_true",
)
# vLLM-native encoder worker (ECConnector): store_true switch. Its help text
# lists the --ec-* flags that depend on it.
parser.add_argument(
    "--vllm-native-encoder-worker",
    help="Run as vLLM-native encoder worker using ECConnector for encoder disaggregation (requires shared storage). The following flags only work when this flag is enabled: --ec-connector-backend, --ec-storage-path, --ec-extra-config, --ec-consumer-mode.",
    action="store_true",
)
parser.add_argument(
    "--ec-connector-backend",
    help="ECConnector implementation class for encoder disaggregation. Default: ECExampleConnector (disk-based)",
    default="ECExampleConnector",
    type=str,
)
parser.add_argument(
    "--ec-storage-path",
    help="Storage path for ECConnector (required for ECExampleConnector, optional for other backends)",
    default=None,
    type=str,
)
parser.add_argument(
    "--ec-extra-config",
    help="Additional ECConnector configuration as JSON string",
    default=None,
    type=str,
)
parser.add_argument(
    "--ec-consumer-mode",
    help="Configure as ECConnector consumer for receiving encoder embeddings (for PD workers)",
    action="store_true",
)

# vLLM-Omni: mode switch plus an optional stage-config path.
parser.add_argument(
    "--omni",
    help="Run as vLLM-Omni worker for multi-stage pipelines (supports text-to-text, text-to-image, etc.)",
    action="store_true",
)
parser.add_argument(
    "--stage-configs-path",
    help="Path to vLLM-Omni stage configuration YAML file for --omni mode (optional).",
    default=None,
    type=str,
)
# Key-value backend selection. The default is read from the DYN_STORE_KV
# environment variable when the parser is built, falling back to "etcd".
# The call carries exactly one help= keyword (a repeated keyword argument is
# a SyntaxError) and exactly one closing parenthesis.
parser.add_argument(
    "--store-kv",
    type=str,
    choices=["etcd", "file", "mem"],
    default=os.environ.get("DYN_STORE_KV", "etcd"),
    help="Which key-value backend to use: etcd, mem, file. Etcd uses the ETCD_* env vars (e.g. ETCD_ENDPOINTS) for connection details. File uses root dir from env var DYN_FILE_KV or defaults to $TMPDIR/dynamo_store_kv.",
)
# Tokenizer switch registered directly on the parser (store_true, default False).
# NOTE(review): another registration of --use-vllm-tokenizer appears further
# down in this file; argparse rejects a duplicate option string on the same
# parser by default, so confirm whether both registrations are meant to coexist.
parser.add_argument(
    "--use-vllm-tokenizer",
    action="store_true",
    default=False,
    help="Use vLLM's tokenizer for pre and post processing. This bypasses Dynamo's preprocessor and only v1/chat/completions will be available through the Dynamo frontend.",
)

# Build argument parser
dynamo_runtime_argspec.add_arguments(parser)
dynamo_vllm_argspec.add_arguments(parser)

# trick to add vllm engine flags to a specific group without breaking the Dynamo groups.
# The group takes only its title here; add_argument_group() has no help= option.
vg = parser.add_argument_group(
    "vLLM Engine Options. Please refer to vLLM documentation for more details."
)
# Decode-worker marker: negatable boolean, off by default.
add_negatable_bool_argument(
    g,
    flag_name="--is-decode-worker",
    env_var="DYN_VLLM_IS_DECODE_WORKER",
    help="Mark this as a decode worker which does not publish KV events",
    default=False,
)

# Tokenizer selection: negatable boolean, off by default.
add_negatable_bool_argument(
    g,
    flag_name="--use-vllm-tokenizer",
    env_var="DYN_VLLM_USE_TOKENIZER",
    help="Use vLLM's tokenizer for pre and post processing. This bypasses Dynamo's preprocessor and only v1/chat/completions will be available through the Dynamo frontend.",
    default=False,
)

# Sleep-mode level: integer restricted to 1, 2 or 3; default 1.
add_argument(
    g,
    flag_name="--sleep-mode-level",
    env_var="DYN_VLLM_SLEEP_MODE_LEVEL",
    help="Sleep mode level (1=offload to CPU, 2=discard weights, 3=discard all).",
    arg_type=int,
    default=1,
    choices=[1, 2, 3],
)
# Multimodal component roles. Each is a negatable boolean, off by default,
# with its own DYN_VLLM_* environment variable.
add_negatable_bool_argument(
    g,
    flag_name="--multimodal-processor",
    env_var="DYN_VLLM_MULTIMODAL_PROCESSOR",
    help="Run as multimodal processor component for handling multimodal requests.",
    default=False,
)
add_negatable_bool_argument(
    g,
    flag_name="--ec-processor",
    env_var="DYN_VLLM_EC_PROCESSOR",
    help="Run as ECConnector processor (routes multimodal requests to encoder then PD workers).",
    default=False,
)
add_negatable_bool_argument(
    g,
    flag_name="--multimodal-encode-worker",
    env_var="DYN_VLLM_MULTIMODAL_ENCODE_WORKER",
    help="Run as multimodal encode worker component for processing images/videos.",
    default=False,
)
add_negatable_bool_argument(
    g,
    flag_name="--multimodal-worker",
    env_var="DYN_VLLM_MULTIMODAL_WORKER",
    help="Run as multimodal worker component for LLM inference with multimodal data.",
    default=False,
)
# Multimodal decode worker: negatable boolean, off by default.
add_negatable_bool_argument(
    g,
    flag_name="--multimodal-decode-worker",
    env_var="DYN_VLLM_MULTIMODAL_DECODE_WORKER",
    default=False,
    help="Run as multimodal decode worker in disaggregated mode.",
)

# Unified encode+prefill+decode worker, registered as its own flag so that
# each call carries a single help= keyword (a repeated keyword argument is a
# SyntaxError).
# NOTE(review): the env var name follows the DYN_VLLM_<FLAG> pattern of the
# sibling multimodal flags — confirm against the deployment docs.
add_negatable_bool_argument(
    g,
    flag_name="--multimodal-encode-prefill-worker",
    env_var="DYN_VLLM_MULTIMODAL_ENCODE_PREFILL_WORKER",
    default=False,
    help="Run as unified encode+prefill+decode worker for models requiring integrated image encoding (e.g., Llama 4).",
)
# Switch described by its help text as the gate for all multimodal
# components; negatable boolean, off by default.
add_negatable_bool_argument(
    g,
    flag_name="--enable-multimodal",
    env_var="DYN_VLLM_ENABLE_MULTIMODAL",
    help="Enable multimodal processing. If not set, none of the multimodal components can be used.",
    default=False,
)

# Prompt template for multimodal requests; the help text explains how
# '<prompt>' is substituted.
add_argument(
    g,
    flag_name="--mm-prompt-template",
    env_var="DYN_VLLM_MM_PROMPT_TEMPLATE",
    help="Different multi-modal models expect the prompt to contain different special media prompts. "
    "The processor will use this argument to construct the final prompt. "
    "User prompt will replace '<prompt>' in the provided template. "
    "For example, if the user prompt is 'please describe the image' and the prompt template is "
    "'USER: <image> <prompt> ASSISTANT:', the resulting prompt is "
    "'USER: <image> please describe the image ASSISTANT:'.",
    default="USER: <image>\n<prompt> ASSISTANT:",
)

# Frontend image decoding: negatable boolean, off by default.
add_negatable_bool_argument(
    g,
    flag_name="--frontend-decoding",
    env_var="DYN_VLLM_FRONTEND_DECODING",
    help="Enable frontend decoding of multimodal images. "
    "When enabled, images are decoded in the Rust frontend and transferred to the backend via NIXL RDMA. "
    "Without this flag, images are decoded in the Python backend (default behavior).",
    default=False,
)
# vLLM-native encoder (ECConnector) options. The first flag's help text
# lists the --ec-* flags that depend on it.
add_negatable_bool_argument(
    g,
    flag_name="--vllm-native-encoder-worker",
    env_var="DYN_VLLM_NATIVE_ENCODER_WORKER",
    help="Run as vLLM-native encoder worker using ECConnector for encoder disaggregation (requires shared storage). The following flags only work when this flag is enabled: --ec-connector-backend, --ec-storage-path, --ec-extra-config, --ec-consumer-mode.",
    default=False,
)

# Connector implementation class name; default "ECExampleConnector".
add_argument(
    g,
    flag_name="--ec-connector-backend",
    env_var="DYN_VLLM_EC_CONNECTOR_BACKEND",
    help="ECConnector implementation class for encoder disaggregation.",
    default="ECExampleConnector",
)

# Storage path and extra JSON configuration; both unset (None) by default.
add_argument(
    g,
    flag_name="--ec-storage-path",
    env_var="DYN_VLLM_EC_STORAGE_PATH",
    help="Storage path for ECConnector (required for ECExampleConnector, optional for other backends).",
    default=None,
)
add_argument(
    g,
    flag_name="--ec-extra-config",
    env_var="DYN_VLLM_EC_EXTRA_CONFIG",
    help="Additional ECConnector configuration as JSON string.",
    default=None,
)

# Consumer mode: negatable boolean, off by default.
add_negatable_bool_argument(
    g,
    flag_name="--ec-consumer-mode",
    env_var="DYN_VLLM_EC_CONSUMER_MODE",
    help="Configure as ECConnector consumer for receiving encoder embeddings (for PD workers).",
    default=False,
)
# vLLM-Omni: mode switch (negatable boolean, off by default) followed by an
# optional stage-configuration path (None by default).
add_negatable_bool_argument(
    g,
    flag_name="--omni",
    env_var="DYN_VLLM_OMNI",
    help="Run as vLLM-Omni worker for multi-stage pipelines (supports text-to-text, text-to-image, etc.).",
    default=False,
)
add_argument(
    g,
    flag_name="--stage-configs-path",
    env_var="DYN_VLLM_STAGE_CONFIGS_PATH",
    help="Path to vLLM-Omni stage configuration YAML file for --omni mode (optional).",
    default=None,
)
# @dataclass()
classDynamoVllmConfig(ConfigBase):
"""Configuration for Dynamo vLLM wrapper (vLLM-specific only). All fields optional."""