Unverified Commit 6d3b92f0 authored by Alec's avatar Alec Committed by GitHub
Browse files

feat: remove --connector flag for vLLM backend (LLM-90) (#6450)


Signed-off-by: default avataralec-flowers <aflowers@nvidia.com>
Co-authored-by: default avatarClaude Opus 4.6 <noreply@anthropic.com>
parent c9ff6235
...@@ -110,8 +110,8 @@ class DynamoRuntimeArgGroup(ArgGroup): ...@@ -110,8 +110,8 @@ class DynamoRuntimeArgGroup(ArgGroup):
g, g,
flag_name="--connector", flag_name="--connector",
env_var="DYN_CONNECTOR", env_var="DYN_CONNECTOR",
default=["nixl"], default=[],
help="List of connectors to use in order (e.g., --connector nixl lmcache). Options: nixl, lmcache, kvbm, null, none. Order will be preserved in MultiConnector.", help="[Deprecated for vLLM] Use --kv-transfer-config instead. For TRT-LLM, options: nixl, lmcache, kvbm, null, none.",
nargs="*", nargs="*",
) )
......
...@@ -2,13 +2,13 @@ ...@@ -2,13 +2,13 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import argparse import argparse
import json
import logging import logging
import os import os
import socket import socket
import warnings import warnings
from typing import Any, Dict, Optional from typing import Any, Dict, Optional
from vllm.config import KVTransferConfig
from vllm.distributed.kv_events import KVEventsConfig from vllm.distributed.kv_events import KVEventsConfig
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
...@@ -31,7 +31,6 @@ from . import envs ...@@ -31,7 +31,6 @@ from . import envs
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
DEFAULT_MODEL = "Qwen/Qwen3-0.6B" DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
VALID_CONNECTORS = {"nixl", "lmcache", "kvbm", "null", "none"}
class Config(DynamoRuntimeConfig, DynamoVllmConfig): class Config(DynamoRuntimeConfig, DynamoVllmConfig):
...@@ -54,18 +53,6 @@ class Config(DynamoRuntimeConfig, DynamoVllmConfig): ...@@ -54,18 +53,6 @@ class Config(DynamoRuntimeConfig, DynamoVllmConfig):
DynamoRuntimeConfig.validate(self) DynamoRuntimeConfig.validate(self)
DynamoVllmConfig.validate(self) DynamoVllmConfig.validate(self)
def has_connector(self, connector_name: str) -> bool:
"""
Check if a specific connector is enabled.
Args:
connector_name: Name of the connector to check (e.g., "kvbm", "nixl")
Returns:
True if the connector is in the connector list, False otherwise
"""
return self.connector is not None and connector_name in self.connector
@register_encoder(Config) @register_encoder(Config)
def _preprocess_for_encode_config(config: Config) -> Dict[str, Any]: def _preprocess_for_encode_config(config: Config) -> Dict[str, Any]:
...@@ -199,30 +186,28 @@ def update_dynamo_config_with_engine( ...@@ -199,30 +186,28 @@ def update_dynamo_config_with_engine(
"Please ensure the file exists and the path is correct." "Please ensure the file exists and the path is correct."
) )
normalized = [c.lower() for c in (dynamo_config.connector or [])] # --connector is no longer supported for vLLM. Raise hard error if explicitly set.
invalid = [c for c in normalized if c not in VALID_CONNECTORS] _reject_connector_flag(dynamo_config)
if invalid:
raise ValueError(
f"Invalid connector(s): {', '.join(invalid)}. "
f"Valid options are: {', '.join(sorted(VALID_CONNECTORS))}"
)
# If disaggregation mode is prefill, require explicit --kv-transfer-config
has_kv_transfer_config = ( has_kv_transfer_config = (
hasattr(engine_config, "kv_transfer_config") hasattr(engine_config, "kv_transfer_config")
and engine_config.kv_transfer_config is not None and engine_config.kv_transfer_config is not None
) )
if not normalized or "none" in normalized or "null" in normalized: if (
if len(normalized) > 1: dynamo_config.disaggregation_mode == DisaggregationMode.PREFILL
and not has_kv_transfer_config
):
raise ValueError( raise ValueError(
"'none' and 'null' cannot be combined with other connectors" "--connector is deprecated and the default is no longer nixl. "
"When using --disaggregation-mode prefill, you must explicitly "
"provide --kv-transfer-config. Example:\n"
" --kv-transfer-config "
'\'{"kv_connector":"NixlConnector","kv_role":"kv_both"}\''
) )
# Clear connector list (no longer used for vLLM)
dynamo_config.connector = [] # type: ignore[assignment] dynamo_config.connector = [] # type: ignore[assignment]
else:
if has_kv_transfer_config:
raise ValueError(
"Cannot specify both --kv-transfer-config and --connector flags"
)
dynamo_config.connector = normalized # type: ignore[assignment]
# Validate ModelExpress P2P server URL # Validate ModelExpress P2P server URL
if getattr(engine_config, "load_format", None) in ("mx-source", "mx-target"): if getattr(engine_config, "load_format", None) in ("mx-source", "mx-target"):
...@@ -236,31 +221,7 @@ def update_dynamo_config_with_engine( ...@@ -236,31 +221,7 @@ def update_dynamo_config_with_engine(
def update_engine_config_with_dynamo( def update_engine_config_with_dynamo(
dynamo_config: Config, engine_config: AsyncEngineArgs dynamo_config: Config, engine_config: AsyncEngineArgs
) -> None: ) -> None:
"""Update engine config base on Dynamo config.""" """Update engine config based on Dynamo config."""
# Workaround for vLLM GIL contention bug with NIXL connector when using UniProcExecutor.
# With TP=1, vLLM defaults to UniProcExecutor which runs scheduler and worker in the same
# process. This causes a hot loop in _process_engine_step that doesn't release the GIL,
# blocking NIXL's add_remote_agent from completing. Using "mp" backend forces separate
# processes, avoiding the GIL contention.
# Note: Only apply for NIXL - other connectors (kvbm, lmcache) work fine with UniProcExecutor
# and forcing mp can expose race conditions in vLLM's scheduler.
# See: https://github.com/vllm-project/vllm/issues/29369
connector_list = (
[c.lower() for c in dynamo_config.connector] if dynamo_config.connector else []
)
uses_nixl = "nixl" in connector_list
tp_size = getattr(engine_config, "tensor_parallel_size", None) or 1
if (
uses_nixl
and tp_size == 1
and getattr(engine_config, "distributed_executor_backend", None) is None
):
logger.info(
"Setting --distributed-executor-backend=mp for TP=1 to avoid "
"UniProcExecutor GIL contention with NIXL connector"
)
engine_config.distributed_executor_backend = "mp"
if engine_config.enable_prefix_caching is None: if engine_config.enable_prefix_caching is None:
logger.debug( logger.debug(
"--enable-prefix-caching or --no-enable-prefix-caching not specified. " "--enable-prefix-caching or --no-enable-prefix-caching not specified. "
...@@ -274,12 +235,7 @@ def update_engine_config_with_dynamo( ...@@ -274,12 +235,7 @@ def update_engine_config_with_dynamo(
f"Setting reasonable default of {engine_config.block_size} for block_size" f"Setting reasonable default of {engine_config.block_size} for block_size"
) )
if dynamo_config.has_connector("nixl") or ( if _uses_nixl_connector(engine_config):
# Check if the user provided their own kv_transfer_config
getattr(engine_config, "kv_transfer_config", None) is not None
# and the connector is NixlConnector
and engine_config.kv_transfer_config.kv_connector == "NixlConnector"
):
ensure_side_channel_host() ensure_side_channel_host()
defaults = { defaults = {
...@@ -294,9 +250,6 @@ def update_engine_config_with_dynamo( ...@@ -294,9 +250,6 @@ def update_engine_config_with_dynamo(
"disable_log_stats": False, "disable_log_stats": False,
} }
kv_transfer_config = create_kv_transfer_config(dynamo_config, engine_config)
if kv_transfer_config:
defaults["kv_transfer_config"] = kv_transfer_config
kv_cfg = create_kv_events_config(dynamo_config, engine_config) kv_cfg = create_kv_events_config(dynamo_config, engine_config)
defaults["kv_events_config"] = kv_cfg defaults["kv_events_config"] = kv_cfg
dynamo_config.use_kv_events = kv_cfg is not None and kv_cfg.enable_kv_cache_events dynamo_config.use_kv_events = kv_cfg is not None and kv_cfg.enable_kv_cache_events
...@@ -367,53 +320,134 @@ def create_kv_events_config( ...@@ -367,53 +320,134 @@ def create_kv_events_config(
) )
def _uses_nixl_connector(engine_config: AsyncEngineArgs) -> bool:
    """Check if the user-provided --kv-transfer-config uses NixlConnector.

    Handles both direct usage (kv_connector="NixlConnector") and nested usage
    inside PdConnector, where kv_connector_extra_config["connectors"] is a
    list of dicts such as {"kv_connector": "NixlConnector", ...}.

    Args:
        engine_config: Engine arguments. Only the ``kv_transfer_config``
            attribute is read; it may be absent or None.

    Returns:
        True if NixlConnector is configured directly or as one of the
        connectors wrapped by a PdConnector, False otherwise.
    """
    kv_cfg = getattr(engine_config, "kv_transfer_config", None)
    if kv_cfg is None:
        return False

    if kv_cfg.kv_connector == "NixlConnector":
        return True
    if kv_cfg.kv_connector != "PdConnector":
        return False

    extra = kv_cfg.kv_connector_extra_config or {}
    # `or []` also covers an explicit JSON null for "connectors"; with
    # .get("connectors", []) that case returned None and iterating it raised
    # TypeError.
    nested = extra.get("connectors") or []
    return any(
        isinstance(entry, dict) and entry.get("kv_connector") == "NixlConnector"
        for entry in nested
    )
def _uses_dynamo_connector(engine_config: AsyncEngineArgs) -> bool:
    """Check if the user-provided --kv-transfer-config uses DynamoConnector (KVBM).

    Handles both direct usage (kv_connector="DynamoConnector") and nested
    usage inside PdConnector, where kv_connector_extra_config["connectors"]
    is a list of dicts such as {"kv_connector": "DynamoConnector", ...}.

    Args:
        engine_config: Engine arguments. Only the ``kv_transfer_config``
            attribute is read; it may be absent or None.

    Returns:
        True if DynamoConnector is configured directly or as one of the
        connectors wrapped by a PdConnector, False otherwise.
    """
    kv_cfg = getattr(engine_config, "kv_transfer_config", None)
    if kv_cfg is None:
        return False

    if kv_cfg.kv_connector == "DynamoConnector":
        return True
    if kv_cfg.kv_connector != "PdConnector":
        return False

    extra = kv_cfg.kv_connector_extra_config or {}
    # `or []` also covers an explicit JSON null for "connectors"; with
    # .get("connectors", []) that case returned None and iterating it raised
    # TypeError.
    nested = extra.get("connectors") or []
    return any(
        isinstance(entry, dict) and entry.get("kv_connector") == "DynamoConnector"
        for entry in nested
    )
def _connector_to_kv_transfer_json(connectors: list[str]) -> str:
    """Render a legacy --connector list as --kv-transfer-config JSON.

    Used in error messages to help users migrate.

    Names are matched case-insensitively against lmcache, nixl and kvbm; any
    other name is skipped. Exactly one recognized connector yields that
    connector's own config. Any other count (including zero) is wrapped in a
    PdConnector config that lists the recognized connectors in input order.

    Args:
        connectors: Legacy connector names as passed to --connector.

    Returns:
        A JSON string suitable as the value of --kv-transfer-config.
    """
    # Built per call so callers can never observe shared mutable state.
    legacy_to_config = {
        "lmcache": {"kv_connector": "LMCacheConnectorV1", "kv_role": "kv_both"},
        "nixl": {"kv_connector": "NixlConnector", "kv_role": "kv_both"},
        "kvbm": {
            "kv_connector": "DynamoConnector",
            "kv_connector_module_path": "kvbm.vllm_integration.connector",
            "kv_role": "kv_both",
        },
    }

    translated = []
    for legacy_name in connectors:
        config = legacy_to_config.get(legacy_name.lower())
        if config is not None:
            translated.append(config)

    if len(translated) == 1:
        payload = translated[0]
    else:
        payload = {
            "kv_connector": "PdConnector",
            "kv_role": "kv_both",
            "kv_connector_extra_config": {"connectors": translated},
            "kv_connector_module_path": "kvbm.vllm_integration.connector",
        }
    return json.dumps(payload)
def _reject_connector_flag(dynamo_config: Config) -> None:
    """Fail fast when the removed --connector option is still being used.

    The --connector flag is no longer supported for the vLLM backend; users
    must pass --kv-transfer-config instead. Both the parsed connector list
    and the raw DYN_CONNECTOR environment variable are inspected.

    Args:
        dynamo_config: Parsed Dynamo config; only ``connector`` is read.

    Raises:
        ValueError: If the connector list is non-empty or DYN_CONNECTOR is
            present in the environment. Where active connectors can be
            identified, the message includes the equivalent
            --kv-transfer-config JSON.
    """
    requested = [name.lower() for name in (dynamo_config.connector or [])]

    if requested:
        if all(name in ("none", "null") for name in requested):
            # Only "none"/"null" were given: nothing to migrate, just drop the flag.
            raise ValueError(
                "--connector is no longer supported for the vLLM backend. "
                "'--connector none' is no longer needed — the default is already "
                "no connector. Simply remove the --connector flag."
            )
        # At least one active connector: show the migration path.
        equiv = _connector_to_kv_transfer_json(requested)
        raise ValueError(
            "--connector is no longer supported for the vLLM backend. "
            "Use --kv-transfer-config instead.\n"
            f" Equivalent: --kv-transfer-config '{equiv}'"
        )

    env_raw = os.environ.get("DYN_CONNECTOR")
    if env_raw is None:
        # Neither the parsed connector list nor the environment variable is set.
        return

    # DYN_CONNECTOR is set but the parsed connector list came out empty.
    env_names = [tok.strip().lower() for tok in env_raw.split() if tok.strip()]
    only_disabling = all(name in ("none", "null") for name in env_names)
    if env_names and not only_disabling:
        equiv = _connector_to_kv_transfer_json(env_names)
        raise ValueError(
            "The DYN_CONNECTOR environment variable is no longer supported "
            "for the vLLM backend. Use --kv-transfer-config instead.\n"
            f" Equivalent: --kv-transfer-config '{equiv}'"
        )
    raise ValueError(
        "The DYN_CONNECTOR environment variable is no longer supported "
        "for the vLLM backend. Use --kv-transfer-config instead."
    )
......
...@@ -52,7 +52,7 @@ from dynamo.runtime import DistributedRuntime, Endpoint ...@@ -52,7 +52,7 @@ from dynamo.runtime import DistributedRuntime, Endpoint
from dynamo.runtime.logging import configure_dynamo_logging from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.vllm.worker_factory import WorkerFactory from dynamo.vllm.worker_factory import WorkerFactory
from .args import Config, parse_args from .args import Config, _uses_dynamo_connector, parse_args
from .checkpoint_restore import get_checkpoint_config from .checkpoint_restore import get_checkpoint_config
from .constants import DisaggregationMode from .constants import DisaggregationMode
from .handlers import DecodeWorkerHandler, PrefillWorkerHandler from .handlers import DecodeWorkerHandler, PrefillWorkerHandler
...@@ -466,9 +466,9 @@ def setup_vllm_engine(config, stat_logger=None): ...@@ -466,9 +466,9 @@ def setup_vllm_engine(config, stat_logger=None):
usage_context = UsageContext.OPENAI_API_SERVER usage_context = UsageContext.OPENAI_API_SERVER
vllm_config = engine_args.create_engine_config(usage_context=usage_context) vllm_config = engine_args.create_engine_config(usage_context=usage_context)
# Set up consolidator endpoints if KVBM is enabled # Set up consolidator endpoints if KVBM (DynamoConnector) is enabled
consolidator_endpoints = None consolidator_endpoints = None
if config.has_connector("kvbm"): if _uses_dynamo_connector(config.engine_args):
try: try:
from kvbm.vllm_integration.consolidator_config import ( from kvbm.vllm_integration.consolidator_config import (
get_consolidator_endpoints, get_consolidator_endpoints,
......
...@@ -3,13 +3,20 @@ ...@@ -3,13 +3,20 @@
"""Unit tests for vLLM backend components.""" """Unit tests for vLLM backend components."""
import json
import re import re
import warnings import warnings
from pathlib import Path from pathlib import Path
from types import SimpleNamespace
import pytest import pytest
from dynamo.vllm.args import parse_args from dynamo.vllm.args import (
_connector_to_kv_transfer_json,
_uses_dynamo_connector,
_uses_nixl_connector,
parse_args,
)
from dynamo.vllm.constants import DisaggregationMode from dynamo.vllm.constants import DisaggregationMode
from dynamo.vllm.tests.conftest import make_cli_args_fixture from dynamo.vllm.tests.conftest import make_cli_args_fixture
...@@ -179,6 +186,8 @@ def test_endpoint_overrides_with_prefill_worker(mock_vllm_cli): ...@@ -179,6 +186,8 @@ def test_endpoint_overrides_with_prefill_worker(mock_vllm_cli):
"dyn://custom.worker.serve", "dyn://custom.worker.serve",
"--disaggregation-mode", "--disaggregation-mode",
"prefill", "prefill",
"--kv-transfer-config",
'{"kv_connector":"NixlConnector","kv_role":"kv_both"}',
) )
config = parse_args() config = parse_args()
assert config.namespace == "custom" assert config.namespace == "custom"
...@@ -198,6 +207,95 @@ def test_endpoint_invalid_format_raises(mock_vllm_cli): ...@@ -198,6 +207,95 @@ def test_endpoint_invalid_format_raises(mock_vllm_cli):
parse_args() parse_args()
# --connector removal tests
def test_connector_nixl_raises_error_with_migration_hint(mock_vllm_cli):
    """Test that --connector nixl raises ValueError with --kv-transfer-config hint."""
    mock_vllm_cli("--model", "Qwen/Qwen3-0.6B", "--connector", "nixl")
    with pytest.raises(
        ValueError, match="--connector is no longer supported"
    ) as exc_info:
        parse_args()

    # The generic rejection text alone does not prove the migration hint is
    # present; also check that the equivalent config is spelled out.
    message = str(exc_info.value)
    assert "--kv-transfer-config" in message
    assert "NixlConnector" in message
def test_connector_none_raises_error(mock_vllm_cli):
    """--connector none must be rejected with a "no longer needed" message."""
    cli_args = ("--model", "Qwen/Qwen3-0.6B", "--connector", "none")
    mock_vllm_cli(*cli_args)

    with pytest.raises(ValueError, match="no longer needed"):
        parse_args()
def test_env_var_dyn_connector_raises_error(monkeypatch, mock_vllm_cli):
    """Setting DYN_CONNECTOR must be rejected for the vLLM backend."""
    # No --connector on the command line: the env var is the only source.
    monkeypatch.setenv("DYN_CONNECTOR", "nixl")
    cli_args = ("--model", "Qwen/Qwen3-0.6B")
    mock_vllm_cli(*cli_args)

    with pytest.raises(ValueError, match="no longer supported"):
        parse_args()
def test_prefill_worker_without_kv_transfer_config_raises(mock_vllm_cli):
    """Prefill mode without --kv-transfer-config must raise ValueError."""
    cli_args = ("--model", "Qwen/Qwen3-0.6B", "--disaggregation-mode", "prefill")
    mock_vllm_cli(*cli_args)

    with pytest.raises(ValueError, match="--kv-transfer-config"):
        parse_args()
def test_connector_to_kv_transfer_json_single():
    """A single legacy connector maps to that connector's config, unwrapped."""
    raw_json = _connector_to_kv_transfer_json(["nixl"])
    expected = {"kv_connector": "NixlConnector", "kv_role": "kv_both"}
    assert json.loads(raw_json) == expected
def test_connector_to_kv_transfer_json_multi():
    """Several legacy connectors are wrapped together in a PdConnector config."""
    payload = json.loads(_connector_to_kv_transfer_json(["kvbm", "nixl"]))
    assert payload["kv_connector"] == "PdConnector"

    wrapped_names = [
        entry["kv_connector"]
        for entry in payload["kv_connector_extra_config"]["connectors"]
    ]
    for expected_name in ("DynamoConnector", "NixlConnector"):
        assert expected_name in wrapped_names
# _uses_nixl_connector / _uses_dynamo_connector tests
def _make_engine_cfg(kv_connector=None, extra_config=None):
    """Return a stand-in engine config exposing only ``kv_transfer_config``.

    When ``kv_connector`` is None the attribute is None (and ``extra_config``
    is ignored). Otherwise it is a namespace carrying ``kv_connector`` and
    ``kv_connector_extra_config``.
    """
    transfer_cfg = None
    if kv_connector is not None:
        transfer_cfg = SimpleNamespace(
            kv_connector=kv_connector,
            kv_connector_extra_config=extra_config,
        )
    return SimpleNamespace(kv_transfer_config=transfer_cfg)
# kv_connector_extra_config payload for a PdConnector that wraps both a
# DynamoConnector and a NixlConnector; shared by the nested-detection tests.
_PD_KVBM_NIXL = {
    "connectors": [
        {"kv_connector": "DynamoConnector", "kv_role": "kv_both"},
        {"kv_connector": "NixlConnector", "kv_role": "kv_both"},
    ]
}
def test_uses_nixl_connector_direct_and_nested():
    """_uses_nixl_connector: direct, nested-in-PdConnector, other, and absent."""
    cases = (
        (_make_engine_cfg("NixlConnector"), True),
        (_make_engine_cfg("PdConnector", _PD_KVBM_NIXL), True),
        (_make_engine_cfg("LMCacheConnectorV1"), False),
        (_make_engine_cfg(), False),
    )
    for engine_cfg, expected in cases:
        assert _uses_nixl_connector(engine_cfg) is expected
def test_uses_dynamo_connector_direct_and_nested():
    """_uses_dynamo_connector: direct, nested-in-PdConnector, other, and absent."""
    cases = (
        (_make_engine_cfg("DynamoConnector"), True),
        (_make_engine_cfg("PdConnector", _PD_KVBM_NIXL), True),
        (_make_engine_cfg("NixlConnector"), False),
        (_make_engine_cfg(), False),
    )
    for engine_cfg, expected in cases:
        assert _uses_dynamo_connector(engine_cfg) is expected
def test_headless_namespace_has_required_fields(mock_vllm_cli): def test_headless_namespace_has_required_fields(mock_vllm_cli):
"""Test that build_headless_namespace produces a Namespace with fields """Test that build_headless_namespace produces a Namespace with fields
required by vLLM's run_headless(), including the api_server_count fallback.""" required by vLLM's run_headless(), including the api_server_count fallback."""
...@@ -235,7 +333,14 @@ def test_disaggregation_mode_default(mock_vllm_cli): ...@@ -235,7 +333,14 @@ def test_disaggregation_mode_default(mock_vllm_cli):
def test_disaggregation_mode_prefill(mock_vllm_cli): def test_disaggregation_mode_prefill(mock_vllm_cli):
"""Test --disaggregation-mode prefill sets correct state.""" """Test --disaggregation-mode prefill sets correct state."""
mock_vllm_cli("--model", "Qwen/Qwen3-0.6B", "--disaggregation-mode", "prefill") mock_vllm_cli(
"--model",
"Qwen/Qwen3-0.6B",
"--disaggregation-mode",
"prefill",
"--kv-transfer-config",
'{"kv_connector":"NixlConnector","kv_role":"kv_both"}',
)
config = parse_args() config = parse_args()
assert config.disaggregation_mode == DisaggregationMode.PREFILL assert config.disaggregation_mode == DisaggregationMode.PREFILL
assert config.is_prefill_worker is True assert config.is_prefill_worker is True
...@@ -254,7 +359,13 @@ def test_disaggregation_mode_decode(mock_vllm_cli): ...@@ -254,7 +359,13 @@ def test_disaggregation_mode_decode(mock_vllm_cli):
def test_legacy_is_prefill_worker_emits_deprecation(mock_vllm_cli): def test_legacy_is_prefill_worker_emits_deprecation(mock_vllm_cli):
"""Test that --is-prefill-worker still works but emits DeprecationWarning.""" """Test that --is-prefill-worker still works but emits DeprecationWarning."""
mock_vllm_cli("--model", "Qwen/Qwen3-0.6B", "--is-prefill-worker") mock_vllm_cli(
"--model",
"Qwen/Qwen3-0.6B",
"--is-prefill-worker",
"--kv-transfer-config",
'{"kv_connector":"NixlConnector","kv_role":"kv_both"}',
)
with warnings.catch_warnings(record=True) as w: with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always") warnings.simplefilter("always")
config = parse_args() config = parse_args()
......
...@@ -161,7 +161,7 @@ vLLM workers are configured through command-line arguments. Key parameters inclu ...@@ -161,7 +161,7 @@ vLLM workers are configured through command-line arguments. Key parameters inclu
- `--model`: Model to serve (e.g., `Qwen/Qwen3-0.6B`) - `--model`: Model to serve (e.g., `Qwen/Qwen3-0.6B`)
- `--disaggregation-mode <mode>`: Worker role for disaggregated serving. Accepted values: `prefill`, `decode`, `agg` (default) - `--disaggregation-mode <mode>`: Worker role for disaggregated serving. Accepted values: `prefill`, `decode`, `agg` (default)
- `--metrics-endpoint-port`: Port for publishing KV metrics to Dynamo - `--metrics-endpoint-port`: Port for publishing KV metrics to Dynamo
- `--connector`: Specify which kv_transfer_config you want vllm to use `[nixl, lmcache, kvbm, none]`. This is a helper flag which overwrites the engines KVTransferConfig. - `--kv-transfer-config`: JSON string specifying the vLLM KVTransferConfig (e.g., `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'`). See vLLM documentation for details.
- `--enable-prompt-embeds`: **Enable prompt embeddings feature** (opt-in, default: disabled) - `--enable-prompt-embeds`: **Enable prompt embeddings feature** (opt-in, default: disabled)
- **Required for:** Accepting pre-computed prompt embeddings via API - **Required for:** Accepting pre-computed prompt embeddings via API
- **Default behavior:** Prompt embeddings DISABLED - requests with `prompt_embeds` will fail - **Default behavior:** Prompt embeddings DISABLED - requests with `prompt_embeds` will fail
......
...@@ -23,7 +23,7 @@ When running vLLM through Dynamo, vLLM engine metrics are automatically passed t ...@@ -23,7 +23,7 @@ When running vLLM through Dynamo, vLLM engine metrics are automatically passed t
| Variable/Flag | Description | Default | Example | | Variable/Flag | Description | Default | Example |
|---------------|-------------|---------|---------| |---------------|-------------|---------|---------|
| `DYN_SYSTEM_PORT` | System metrics/health port. Required to expose `/metrics` endpoint. | `-1` (disabled) | `8081` | | `DYN_SYSTEM_PORT` | System metrics/health port. Required to expose `/metrics` endpoint. | `-1` (disabled) | `8081` |
| `--connector` | KV connector to use. Use `lmcache` to enable LMCache metrics. | `nixl` | `--connector lmcache` | | `--kv-transfer-config` | KV transfer configuration JSON. Use LMCache connector to enable LMCache metrics. | - | `--kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'` |
## Getting Started Quickly ## Getting Started Quickly
...@@ -109,18 +109,18 @@ For the complete and authoritative list of all vLLM metrics, see the [official v ...@@ -109,18 +109,18 @@ For the complete and authoritative list of all vLLM metrics, see the [official v
## LMCache Metrics ## LMCache Metrics
When LMCache is enabled with `--connector lmcache` and `DYN_SYSTEM_PORT` is set, LMCache metrics (prefixed with `lmcache:`) are automatically exposed via Dynamo's `/metrics` endpoint alongside vLLM and Dynamo metrics. When LMCache is enabled with `--kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'` and `DYN_SYSTEM_PORT` is set, LMCache metrics (prefixed with `lmcache:`) are automatically exposed via Dynamo's `/metrics` endpoint alongside vLLM and Dynamo metrics.
### Minimum Requirements ### Minimum Requirements
To access LMCache metrics, both of these are required: To access LMCache metrics, both of these are required:
1. `--connector lmcache` - Enables LMCache in vLLM 1. `--kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'` - Enables LMCache in vLLM
2. `DYN_SYSTEM_PORT=8081` - Enables Dynamo's metrics HTTP endpoint 2. `DYN_SYSTEM_PORT=8081` - Enables Dynamo's metrics HTTP endpoint
**Example:** **Example:**
```bash ```bash
DYN_SYSTEM_PORT=8081 \ DYN_SYSTEM_PORT=8081 \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache python -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
``` ```
### Viewing LMCache Metrics ### Viewing LMCache Metrics
......
...@@ -161,7 +161,7 @@ The `/v1/videos` endpoint also accepts NVIDIA extensions via the `nvext` field f ...@@ -161,7 +161,7 @@ The `/v1/videos` endpoint also accepts NVIDIA extensions via the `nvext` field f
| `--omni` | Enable the vLLM-Omni orchestrator (required for all omni workloads) | | `--omni` | Enable the vLLM-Omni orchestrator (required for all omni workloads) |
| `--output-modalities <modality>` | Output modality: `text`, `image`, or `video` | | `--output-modalities <modality>` | Output modality: `text`, `image`, or `video` |
| `--stage-configs-path <path>` | Path to stage config YAML (optional; vLLM-Omni uses model defaults if omitted) | | `--stage-configs-path <path>` | Path to stage config YAML (optional; vLLM-Omni uses model defaults if omitted) |
| `--connector none` | Disable KV connector (recommended for omni workers) | | _(no `--kv-transfer-config`)_ | KV connector is disabled by default; omit the flag for omni workers |
| `--media-output-fs-url <url>` | Filesystem URL for storing generated media (default: `file:///tmp/dynamo_media`) | | `--media-output-fs-url <url>` | Filesystem URL for storing generated media (default: `file:///tmp/dynamo_media`) |
| `--media-output-http-url <url>` | Base URL for rewriting media paths in responses (optional) | | `--media-output-http-url <url>` | Base URL for rewriting media paths in responses (optional) |
......
...@@ -264,7 +264,7 @@ DYN_KVBM_CPU_CACHE_GB=20 \ ...@@ -264,7 +264,7 @@ DYN_KVBM_CPU_CACHE_GB=20 \
python -m dynamo.vllm \ python -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model Qwen/Qwen3-0.6B \
--enforce-eager \ --enforce-eager \
--connector kvbm --kv-transfer-config '{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"}'
``` ```
### Enable Metrics for TensorRT-LLM ### Enable Metrics for TensorRT-LLM
...@@ -420,7 +420,7 @@ python -m dynamo.frontend & ...@@ -420,7 +420,7 @@ python -m dynamo.frontend &
DYN_KVBM_CPU_CACHE_GB=10 \ DYN_KVBM_CPU_CACHE_GB=10 \
nsys profile -o /tmp/kvbm-nsys --trace-fork-before-exec=true --cuda-graph-trace=node --delay 30 --duration 60 \ nsys profile -o /tmp/kvbm-nsys --trace-fork-before-exec=true --cuda-graph-trace=node --delay 30 --duration 60 \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector kvbm python -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"}'
``` ```
## See Also ## See Also
......
...@@ -81,7 +81,6 @@ The LoRA system consists of: ...@@ -81,7 +81,6 @@ The LoRA system consists of:
# Start vLLM worker with LoRA flags # Start vLLM worker with LoRA flags
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \ DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager \ python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager \
--connector none \
--enable-lora \ --enable-lora \
--max-lora-rank 64 --max-lora-rank 64
``` ```
......
...@@ -34,11 +34,11 @@ title: FlexKV ...@@ -34,11 +34,11 @@ title: FlexKV
### Enable FlexKV ### Enable FlexKV
Set the `DYNAMO_USE_FLEXKV` environment variable and use the `--connector flexkv` flag: Set the `DYNAMO_USE_FLEXKV` environment variable and use the `--kv-transfer-config` flag:
```bash ```bash
export DYNAMO_USE_FLEXKV=1 export DYNAMO_USE_FLEXKV=1
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector flexkv python -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"FlexKVConnector","kv_role":"kv_both"}'
``` ```
## Aggregated Serving ## Aggregated Serving
...@@ -52,7 +52,7 @@ python -m dynamo.frontend & ...@@ -52,7 +52,7 @@ python -m dynamo.frontend &
# Terminal 2: Start vLLM worker with FlexKV # Terminal 2: Start vLLM worker with FlexKV
DYNAMO_USE_FLEXKV=1 \ DYNAMO_USE_FLEXKV=1 \
FLEXKV_CPU_CACHE_GB=32 \ FLEXKV_CPU_CACHE_GB=32 \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector flexkv python -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"FlexKVConnector","kv_role":"kv_both"}'
``` ```
### With KV-Aware Routing ### With KV-Aware Routing
...@@ -72,7 +72,7 @@ FLEXKV_SERVER_RECV_PORT="ipc:///tmp/flexkv_server_0" \ ...@@ -72,7 +72,7 @@ FLEXKV_SERVER_RECV_PORT="ipc:///tmp/flexkv_server_0" \
CUDA_VISIBLE_DEVICES=0 \ CUDA_VISIBLE_DEVICES=0 \
python -m dynamo.vllm \ python -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model Qwen/Qwen3-0.6B \
--connector flexkv \ --kv-transfer-config '{"kv_connector":"FlexKVConnector","kv_role":"kv_both"}' \
--gpu-memory-utilization 0.2 \ --gpu-memory-utilization 0.2 \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' & --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' &
...@@ -83,7 +83,7 @@ FLEXKV_SERVER_RECV_PORT="ipc:///tmp/flexkv_server_1" \ ...@@ -83,7 +83,7 @@ FLEXKV_SERVER_RECV_PORT="ipc:///tmp/flexkv_server_1" \
CUDA_VISIBLE_DEVICES=1 \ CUDA_VISIBLE_DEVICES=1 \
python -m dynamo.vllm \ python -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model Qwen/Qwen3-0.6B \
--connector flexkv \ --kv-transfer-config '{"kv_connector":"FlexKVConnector","kv_role":"kv_both"}' \
--gpu-memory-utilization 0.2 \ --gpu-memory-utilization 0.2 \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
``` ```
...@@ -97,7 +97,7 @@ FlexKV can be used with disaggregated prefill/decode serving. The prefill worker ...@@ -97,7 +97,7 @@ FlexKV can be used with disaggregated prefill/decode serving. The prefill worker
python -m dynamo.frontend & python -m dynamo.frontend &
# Terminal 2: Decode worker (without FlexKV) # Terminal 2: Decode worker (without FlexKV)
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl & CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' &
# Terminal 3: Prefill worker (with FlexKV) # Terminal 3: Prefill worker (with FlexKV)
DYN_VLLM_KV_EVENT_PORT=20081 \ DYN_VLLM_KV_EVENT_PORT=20081 \
...@@ -108,7 +108,7 @@ CUDA_VISIBLE_DEVICES=1 \ ...@@ -108,7 +108,7 @@ CUDA_VISIBLE_DEVICES=1 \
python -m dynamo.vllm \ python -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model Qwen/Qwen3-0.6B \
--disaggregation-mode prefill \ --disaggregation-mode prefill \
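--connector nixl flexkv --kv-transfer-config '{"kv_connector":"MultiConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"NixlConnector","kv_role":"kv_both"},{"kv_connector":"FlexKVConnector","kv_role":"kv_both"}]}}'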
``` ```
## Configuration ## Configuration
......
...@@ -20,10 +20,10 @@ This document describes how LMCache is integrated into Dynamo's vLLM backend to ...@@ -20,10 +20,10 @@ This document describes how LMCache is integrated into Dynamo's vLLM backend to
### Configuration ### Configuration
LMCache is enabled using the `--connector lmcache` flag: LMCache is enabled using the `--kv-transfer-config` flag:
```bash ```bash
python -m dynamo.vllm --model <model_name> --connector lmcache python -m dynamo.vllm --model <model_name> --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
``` ```
### Customization ### Customization
...@@ -157,11 +157,11 @@ kv_transfer_config = KVTransferConfig( ...@@ -157,11 +157,11 @@ kv_transfer_config = KVTransferConfig(
## Metrics and Monitoring ## Metrics and Monitoring
When LMCache is enabled with `--connector lmcache` and `DYN_SYSTEM_PORT` is set, LMCache metrics are automatically exposed via Dynamo's `/metrics` endpoint alongside vLLM and Dynamo metrics. When LMCache is enabled with `--kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'` and `DYN_SYSTEM_PORT` is set, LMCache metrics are automatically exposed via Dynamo's `/metrics` endpoint alongside vLLM and Dynamo metrics.
**Requirements to access LMCache metrics:** **Requirements to access LMCache metrics:**
- `--connector lmcache` - Enables LMCache - `--kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'` - Enables LMCache
- `DYN_SYSTEM_PORT=8081` - Enables metrics HTTP endpoint - `DYN_SYSTEM_PORT=8081` - Enables metrics HTTP endpoint
- `PROMETHEUS_MULTIPROC_DIR` (optional) - If not set, Dynamo manages it internally - `PROMETHEUS_MULTIPROC_DIR` (optional) - If not set, Dynamo manages it internally
......
...@@ -26,7 +26,7 @@ title: Integration README ...@@ -26,7 +26,7 @@ title: Integration README
```bash ```bash
# Add installation and usage from existing integration docs # Add installation and usage from existing integration docs
# Example pattern (LMCache): # Example pattern (LMCache):
# python -m dynamo.vllm --model <model> --connector lmcache # python -m dynamo.vllm --model <model> --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
``` ```
## Configuration ## Configuration
......
...@@ -86,7 +86,7 @@ extraPodSpec: ...@@ -86,7 +86,7 @@ extraPodSpec:
- `--enable-prompt-embeds`: Enable prompt embeddings feature - `--enable-prompt-embeds`: Enable prompt embeddings feature
- `--enable-multimodal`: Enable multimodal (vision) support - `--enable-multimodal`: Enable multimodal (vision) support
- `--disaggregation-mode prefill`: Prefill-only mode for disaggregated serving - `--disaggregation-mode prefill`: Prefill-only mode for disaggregated serving
- `--connector [nixl|lmcache|kvbm|none]`: KV transfer backend selection - `--kv-transfer-config '<json>'`: KV transfer backend configuration (e.g., `'{"kv_connector":"NixlConnector","kv_role":"kv_both"}'`)
## Prerequisites ## Prerequisites
......
...@@ -41,5 +41,5 @@ spec: ...@@ -41,5 +41,5 @@ spec:
- --max-model-len - --max-model-len
- "32000" - "32000"
- --enforce-eager - --enforce-eager
- --connector - --kv-transfer-config
- kvbm - '{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"}'
...@@ -44,8 +44,6 @@ spec: ...@@ -44,8 +44,6 @@ spec:
- --tensor-parallel-size - --tensor-parallel-size
- "2" - "2"
- --is-decode-worker - --is-decode-worker
- --connector
- none
- --kv-transfer-config - --kv-transfer-config
- '{"kv_connector": "NixlConnector", "kv_role": "kv_both", "engine_id": - '{"kv_connector": "NixlConnector", "kv_role": "kv_both", "engine_id":
"vllm-disagg-decode-engine-0abc123"}' "vllm-disagg-decode-engine-0abc123"}'
...@@ -73,8 +71,6 @@ spec: ...@@ -73,8 +71,6 @@ spec:
- "2" - "2"
- --disaggregation-mode - --disaggregation-mode
- prefill - prefill
- --connector
- none
- --kv-transfer-config - --kv-transfer-config
- '{"kv_connector": "NixlConnector", "kv_role": "kv_both", "engine_id": - '{"kv_connector": "NixlConnector", "kv_role": "kv_both", "engine_id":
"vllm-disagg-prefill-engine-0abc123"}' "vllm-disagg-prefill-engine-0abc123"}'
...@@ -66,6 +66,5 @@ spec: ...@@ -66,6 +66,5 @@ spec:
- --max-model-len - --max-model-len
- "32000" - "32000"
- --enforce-eager - --enforce-eager
- --connector - --kv-transfer-config
- kvbm - '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}'
- nixl
...@@ -66,6 +66,5 @@ spec: ...@@ -66,6 +66,5 @@ spec:
- --max-model-len - --max-model-len
- "32000" - "32000"
- --enforce-eager - --enforce-eager
- --connector - --kv-transfer-config
- kvbm - '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}'
- nixl
...@@ -74,8 +74,7 @@ spec: ...@@ -74,8 +74,7 @@ spec:
- --max-model-len - --max-model-len
- "32000" - "32000"
- --enforce-eager - --enforce-eager
- --connector - --kv-transfer-config
- kvbm - '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}'
- nixl
- --tensor-parallel-size - --tensor-parallel-size
- "2" - "2"
...@@ -60,8 +60,6 @@ spec: ...@@ -60,8 +60,6 @@ spec:
args: args:
- --model - --model
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --connector
- none
- --enable-lora - --enable-lora
- --max-lora-rank - --max-lora-rank
- "64" - "64"
......
...@@ -29,4 +29,4 @@ python -m dynamo.frontend & ...@@ -29,4 +29,4 @@ python -m dynamo.frontend &
# run worker # run worker
# --enforce-eager is added for quick deployment. For production use, remove this flag. # --enforce-eager is added for quick deployment. For production use, remove this flag.
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm --model "$MODEL" --enforce-eager --connector none "${EXTRA_ARGS[@]}" python -m dynamo.vllm --model "$MODEL" --enforce-eager "${EXTRA_ARGS[@]}"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment