"docs/vscode:/vscode.git/clone" did not exist on "197f6595892cd78bcb8ce7c3fd76a13e360c996c"
Unverified commit 6d3b92f0, authored by Alec, committed by GitHub
Browse files

feat: remove --connector flag for vLLM backend (LLM-90) (#6450)


Signed-off-by: alec-flowers <aflowers@nvidia.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent c9ff6235
......@@ -110,8 +110,8 @@ class DynamoRuntimeArgGroup(ArgGroup):
g,
flag_name="--connector",
env_var="DYN_CONNECTOR",
default=["nixl"],
help="List of connectors to use in order (e.g., --connector nixl lmcache). Options: nixl, lmcache, kvbm, null, none. Order will be preserved in MultiConnector.",
default=[],
help="[Deprecated for vLLM] Use --kv-transfer-config instead. For TRT-LLM, options: nixl, lmcache, kvbm, null, none.",
nargs="*",
)
......
......@@ -2,13 +2,13 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
import json
import logging
import os
import socket
import warnings
from typing import Any, Dict, Optional
from vllm.config import KVTransferConfig
from vllm.distributed.kv_events import KVEventsConfig
from vllm.engine.arg_utils import AsyncEngineArgs
......@@ -31,7 +31,6 @@ from . import envs
logger = logging.getLogger(__name__)
DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
VALID_CONNECTORS = {"nixl", "lmcache", "kvbm", "null", "none"}
class Config(DynamoRuntimeConfig, DynamoVllmConfig):
......@@ -54,18 +53,6 @@ class Config(DynamoRuntimeConfig, DynamoVllmConfig):
DynamoRuntimeConfig.validate(self)
DynamoVllmConfig.validate(self)
def has_connector(self, connector_name: str) -> bool:
"""
Check if a specific connector is enabled.
Args:
connector_name: Name of the connector to check (e.g., "kvbm", "nixl")
Returns:
True if the connector is in the connector list, False otherwise
"""
return self.connector is not None and connector_name in self.connector
@register_encoder(Config)
def _preprocess_for_encode_config(config: Config) -> Dict[str, Any]:
......@@ -199,30 +186,28 @@ def update_dynamo_config_with_engine(
"Please ensure the file exists and the path is correct."
)
normalized = [c.lower() for c in (dynamo_config.connector or [])]
invalid = [c for c in normalized if c not in VALID_CONNECTORS]
if invalid:
raise ValueError(
f"Invalid connector(s): {', '.join(invalid)}. "
f"Valid options are: {', '.join(sorted(VALID_CONNECTORS))}"
)
# --connector is no longer supported for vLLM. Raise hard error if explicitly set.
_reject_connector_flag(dynamo_config)
# If disaggregation mode is prefill, require explicit --kv-transfer-config
has_kv_transfer_config = (
hasattr(engine_config, "kv_transfer_config")
and engine_config.kv_transfer_config is not None
)
if not normalized or "none" in normalized or "null" in normalized:
if len(normalized) > 1:
if (
dynamo_config.disaggregation_mode == DisaggregationMode.PREFILL
and not has_kv_transfer_config
):
raise ValueError(
"'none' and 'null' cannot be combined with other connectors"
"--connector is deprecated and the default is no longer nixl. "
"When using --disaggregation-mode prefill, you must explicitly "
"provide --kv-transfer-config. Example:\n"
" --kv-transfer-config "
'\'{"kv_connector":"NixlConnector","kv_role":"kv_both"}\''
)
# Clear connector list (no longer used for vLLM)
dynamo_config.connector = [] # type: ignore[assignment]
else:
if has_kv_transfer_config:
raise ValueError(
"Cannot specify both --kv-transfer-config and --connector flags"
)
dynamo_config.connector = normalized # type: ignore[assignment]
# Validate ModelExpress P2P server URL
if getattr(engine_config, "load_format", None) in ("mx-source", "mx-target"):
......@@ -236,31 +221,7 @@ def update_dynamo_config_with_engine(
def update_engine_config_with_dynamo(
dynamo_config: Config, engine_config: AsyncEngineArgs
) -> None:
"""Update engine config base on Dynamo config."""
# Workaround for vLLM GIL contention bug with NIXL connector when using UniProcExecutor.
# With TP=1, vLLM defaults to UniProcExecutor which runs scheduler and worker in the same
# process. This causes a hot loop in _process_engine_step that doesn't release the GIL,
# blocking NIXL's add_remote_agent from completing. Using "mp" backend forces separate
# processes, avoiding the GIL contention.
# Note: Only apply for NIXL - other connectors (kvbm, lmcache) work fine with UniProcExecutor
# and forcing mp can expose race conditions in vLLM's scheduler.
# See: https://github.com/vllm-project/vllm/issues/29369
connector_list = (
[c.lower() for c in dynamo_config.connector] if dynamo_config.connector else []
)
uses_nixl = "nixl" in connector_list
tp_size = getattr(engine_config, "tensor_parallel_size", None) or 1
if (
uses_nixl
and tp_size == 1
and getattr(engine_config, "distributed_executor_backend", None) is None
):
logger.info(
"Setting --distributed-executor-backend=mp for TP=1 to avoid "
"UniProcExecutor GIL contention with NIXL connector"
)
engine_config.distributed_executor_backend = "mp"
"""Update engine config based on Dynamo config."""
if engine_config.enable_prefix_caching is None:
logger.debug(
"--enable-prefix-caching or --no-enable-prefix-caching not specified. "
......@@ -274,12 +235,7 @@ def update_engine_config_with_dynamo(
f"Setting reasonable default of {engine_config.block_size} for block_size"
)
if dynamo_config.has_connector("nixl") or (
# Check if the user provided their own kv_transfer_config
getattr(engine_config, "kv_transfer_config", None) is not None
# and the connector is NixlConnector
and engine_config.kv_transfer_config.kv_connector == "NixlConnector"
):
if _uses_nixl_connector(engine_config):
ensure_side_channel_host()
defaults = {
......@@ -294,9 +250,6 @@ def update_engine_config_with_dynamo(
"disable_log_stats": False,
}
kv_transfer_config = create_kv_transfer_config(dynamo_config, engine_config)
if kv_transfer_config:
defaults["kv_transfer_config"] = kv_transfer_config
kv_cfg = create_kv_events_config(dynamo_config, engine_config)
defaults["kv_events_config"] = kv_cfg
dynamo_config.use_kv_events = kv_cfg is not None and kv_cfg.enable_kv_cache_events
......@@ -367,53 +320,134 @@ def create_kv_events_config(
)
def create_kv_transfer_config(
dynamo_config: Config, engine_config: AsyncEngineArgs
) -> Optional[KVTransferConfig]:
"""Create KVTransferConfig based on user config or connector list.
def _uses_nixl_connector(engine_config: AsyncEngineArgs) -> bool:
"""Check if the user-provided --kv-transfer-config uses NixlConnector.
Handles logging and returns the appropriate config or None.
Handles both direct usage (kv_connector="NixlConnector") and nested usage
inside PdConnector (kv_connector_extra_config.connectors contains
"NixlConnector").
"""
has_user_kv_config = (
hasattr(engine_config, "kv_transfer_config")
and engine_config.kv_transfer_config is not None
kv_cfg = getattr(engine_config, "kv_transfer_config", None)
if kv_cfg is None:
return False
if kv_cfg.kv_connector == "NixlConnector":
return True
# PdConnector wraps multiple connectors in kv_connector_extra_config.
# Each entry is a dict like {"kv_connector": "NixlConnector", ...}.
if kv_cfg.kv_connector == "PdConnector":
extra = kv_cfg.kv_connector_extra_config or {}
for entry in extra.get("connectors", []):
if isinstance(entry, dict) and entry.get("kv_connector") == "NixlConnector":
return True
return False
def _uses_dynamo_connector(engine_config: "AsyncEngineArgs") -> bool:
    """Check if the user-provided --kv-transfer-config uses DynamoConnector (KVBM).

    Handles both direct usage (``kv_connector == "DynamoConnector"``) and nested
    usage inside PdConnector, where the wrapped connectors are listed under
    ``kv_connector_extra_config["connectors"]`` as dicts carrying their own
    ``kv_connector`` key.

    Args:
        engine_config: Engine arguments. Only its ``kv_transfer_config``
            attribute is read; a missing attribute is treated like ``None``.

    Returns:
        True if DynamoConnector is configured directly or as one of the
        PdConnector entries, False otherwise (including when no KV transfer
        config is set at all).
    """
    kv_cfg = getattr(engine_config, "kv_transfer_config", None)
    if kv_cfg is None:
        return False

    if kv_cfg.kv_connector == "DynamoConnector":
        return True
    if kv_cfg.kv_connector != "PdConnector":
        return False

    extra = kv_cfg.kv_connector_extra_config or {}
    # `or []` also covers an explicit JSON null for "connectors"; a plain
    # `.get("connectors", [])` would return None there and fail to iterate.
    return any(
        isinstance(entry, dict) and entry.get("kv_connector") == "DynamoConnector"
        for entry in (extra.get("connectors") or [])
    )
def _connector_to_kv_transfer_json(connectors: list[str]) -> str:
"""Convert a legacy --connector list to the equivalent --kv-transfer-config JSON.
Used in error messages to help users migrate.
"""
multi_connectors = []
for conn in connectors:
c = conn.lower()
if c == "lmcache":
multi_connectors.append(
{"kv_connector": "LMCacheConnectorV1", "kv_role": "kv_both"}
)
if has_user_kv_config:
logger.info("Using user-provided kv_transfer_config from --kv-transfer-config")
return None
if not dynamo_config.connector:
logger.info("Using vLLM defaults for kv_transfer_config")
return None
logger.info(
f"Creating kv_transfer_config from --connector {dynamo_config.connector}"
elif c == "nixl":
multi_connectors.append(
{"kv_connector": "NixlConnector", "kv_role": "kv_both"}
)
multi_connectors = []
for conn in dynamo_config.connector:
if conn == "lmcache":
connector_cfg = {"kv_connector": "LMCacheConnectorV1", "kv_role": "kv_both"}
elif conn == "nixl":
connector_cfg = {"kv_connector": "NixlConnector", "kv_role": "kv_both"}
elif conn == "kvbm":
connector_cfg = {
elif c == "kvbm":
multi_connectors.append(
{
"kv_connector": "DynamoConnector",
"kv_connector_module_path": "kvbm.vllm_integration.connector",
"kv_role": "kv_both",
}
else:
continue
multi_connectors.append(connector_cfg)
)
# For single connector, return direct config
if len(multi_connectors) == 1:
cfg = multi_connectors[0]
return KVTransferConfig(**cfg)
# For multiple connectors, use PdConnector
return KVTransferConfig(
kv_connector="PdConnector",
kv_role="kv_both",
kv_connector_extra_config={"connectors": multi_connectors},
kv_connector_module_path="kvbm.vllm_integration.connector",
return json.dumps(multi_connectors[0])
return json.dumps(
{
"kv_connector": "PdConnector",
"kv_role": "kv_both",
"kv_connector_extra_config": {"connectors": multi_connectors},
"kv_connector_module_path": "kvbm.vllm_integration.connector",
}
)
def _reject_connector_flag(dynamo_config: "Config") -> None:
    """Raise ValueError if --connector was explicitly set (CLI or DYN_CONNECTOR env var).

    The --connector flag is no longer supported for the vLLM backend.
    Users must use --kv-transfer-config instead.

    Args:
        dynamo_config: Parsed Dynamo configuration. Only ``connector`` is read;
            ``None`` and an empty list both mean "nothing parsed".

    Raises:
        ValueError: If ``dynamo_config.connector`` is non-empty or the
            ``DYN_CONNECTOR`` environment variable is present (even when empty).
            For active connectors the message includes the equivalent
            ``--kv-transfer-config`` JSON as a migration hint.
    """
    no_connector = ("none", "null")
    env_connector = os.environ.get("DYN_CONNECTOR")
    normalized = [c.lower() for c in (dynamo_config.connector or [])]

    # Nothing on the CLI and no env var: the flag was not used at all.
    if not normalized and env_connector is None:
        return

    if normalized:
        if all(c in no_connector for c in normalized):
            # --connector none/null: tell the user it is no longer needed.
            message = (
                "--connector is no longer supported for the vLLM backend. "
                "'--connector none' is no longer needed — the default is already "
                "no connector. Simply remove the --connector flag."
            )
            if env_connector is not None:
                # The value may have come from the environment rather than the
                # command line, in which case "remove the flag" is not enough.
                message += (
                    " If it was set through the DYN_CONNECTOR environment "
                    "variable, unset that variable."
                )
            raise ValueError(message)

        # Active connectors: show the migration path.
        equiv = _connector_to_kv_transfer_json(normalized)
        raise ValueError(
            "--connector is no longer supported for the vLLM backend. "
            "Use --kv-transfer-config instead.\n"
            f"  Equivalent: --kv-transfer-config '{equiv}'"
        )

    # Only reachable when DYN_CONNECTOR is set but the parsed list is empty.
    # str.split() without arguments already discards surrounding whitespace
    # and empty fields, so no extra strip/filter is needed.
    env_values = [value.lower() for value in env_connector.split()]
    message = (
        "The DYN_CONNECTOR environment variable is no longer supported "
        "for the vLLM backend. Use --kv-transfer-config instead."
    )
    if env_values and not all(v in no_connector for v in env_values):
        equiv = _connector_to_kv_transfer_json(env_values)
        message += f"\n  Equivalent: --kv-transfer-config '{equiv}'"
    raise ValueError(message)
......
......@@ -52,7 +52,7 @@ from dynamo.runtime import DistributedRuntime, Endpoint
from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.vllm.worker_factory import WorkerFactory
from .args import Config, parse_args
from .args import Config, _uses_dynamo_connector, parse_args
from .checkpoint_restore import get_checkpoint_config
from .constants import DisaggregationMode
from .handlers import DecodeWorkerHandler, PrefillWorkerHandler
......@@ -466,9 +466,9 @@ def setup_vllm_engine(config, stat_logger=None):
usage_context = UsageContext.OPENAI_API_SERVER
vllm_config = engine_args.create_engine_config(usage_context=usage_context)
# Set up consolidator endpoints if KVBM is enabled
# Set up consolidator endpoints if KVBM (DynamoConnector) is enabled
consolidator_endpoints = None
if config.has_connector("kvbm"):
if _uses_dynamo_connector(config.engine_args):
try:
from kvbm.vllm_integration.consolidator_config import (
get_consolidator_endpoints,
......
......@@ -3,13 +3,20 @@
"""Unit tests for vLLM backend components."""
import json
import re
import warnings
from pathlib import Path
from types import SimpleNamespace
import pytest
from dynamo.vllm.args import parse_args
from dynamo.vllm.args import (
_connector_to_kv_transfer_json,
_uses_dynamo_connector,
_uses_nixl_connector,
parse_args,
)
from dynamo.vllm.constants import DisaggregationMode
from dynamo.vllm.tests.conftest import make_cli_args_fixture
......@@ -179,6 +186,8 @@ def test_endpoint_overrides_with_prefill_worker(mock_vllm_cli):
"dyn://custom.worker.serve",
"--disaggregation-mode",
"prefill",
"--kv-transfer-config",
'{"kv_connector":"NixlConnector","kv_role":"kv_both"}',
)
config = parse_args()
assert config.namespace == "custom"
......@@ -198,6 +207,95 @@ def test_endpoint_invalid_format_raises(mock_vllm_cli):
parse_args()
# --connector removal tests
def test_connector_nixl_raises_error_with_migration_hint(mock_vllm_cli):
"""Test that --connector nixl raises ValueError with --kv-transfer-config hint."""
mock_vllm_cli("--model", "Qwen/Qwen3-0.6B", "--connector", "nixl")
with pytest.raises(ValueError, match="--connector is no longer supported"):
parse_args()
def test_connector_none_raises_error(mock_vllm_cli):
"""Test that --connector none raises ValueError telling user it's no longer needed."""
mock_vllm_cli("--model", "Qwen/Qwen3-0.6B", "--connector", "none")
with pytest.raises(ValueError, match="no longer needed"):
parse_args()
def test_env_var_dyn_connector_raises_error(monkeypatch, mock_vllm_cli):
"""Test that DYN_CONNECTOR env var raises error for vLLM backend."""
monkeypatch.setenv("DYN_CONNECTOR", "nixl")
mock_vllm_cli("--model", "Qwen/Qwen3-0.6B")
with pytest.raises(ValueError, match="no longer supported"):
parse_args()
def test_prefill_worker_without_kv_transfer_config_raises(mock_vllm_cli):
"""Test that --disaggregation-mode prefill without --kv-transfer-config raises ValueError."""
mock_vllm_cli("--model", "Qwen/Qwen3-0.6B", "--disaggregation-mode", "prefill")
with pytest.raises(ValueError, match="--kv-transfer-config"):
parse_args()
def test_connector_to_kv_transfer_json_single():
"""Test _connector_to_kv_transfer_json returns valid JSON for a single connector."""
result = json.loads(_connector_to_kv_transfer_json(["nixl"]))
assert result == {"kv_connector": "NixlConnector", "kv_role": "kv_both"}
def test_connector_to_kv_transfer_json_multi():
"""Test _connector_to_kv_transfer_json wraps multiple connectors in PdConnector."""
result = json.loads(_connector_to_kv_transfer_json(["kvbm", "nixl"]))
assert result["kv_connector"] == "PdConnector"
nested = result["kv_connector_extra_config"]["connectors"]
nested_names = [c["kv_connector"] for c in nested]
assert "DynamoConnector" in nested_names
assert "NixlConnector" in nested_names
# _uses_nixl_connector / _uses_dynamo_connector tests
def _make_engine_cfg(kv_connector=None, extra_config=None):
"""Build a minimal fake engine config for connector detection tests."""
if kv_connector is None:
return SimpleNamespace(kv_transfer_config=None)
return SimpleNamespace(
kv_transfer_config=SimpleNamespace(
kv_connector=kv_connector,
kv_connector_extra_config=extra_config,
)
)
_PD_KVBM_NIXL = {
"connectors": [
{"kv_connector": "DynamoConnector", "kv_role": "kv_both"},
{"kv_connector": "NixlConnector", "kv_role": "kv_both"},
]
}
def test_uses_nixl_connector_direct_and_nested():
"""Test _uses_nixl_connector for direct, nested-in-PdConnector, and absent cases."""
assert _uses_nixl_connector(_make_engine_cfg("NixlConnector")) is True
assert _uses_nixl_connector(_make_engine_cfg("PdConnector", _PD_KVBM_NIXL)) is True
assert _uses_nixl_connector(_make_engine_cfg("LMCacheConnectorV1")) is False
assert _uses_nixl_connector(_make_engine_cfg()) is False
def test_uses_dynamo_connector_direct_and_nested():
"""Test _uses_dynamo_connector for direct, nested-in-PdConnector, and absent cases."""
assert _uses_dynamo_connector(_make_engine_cfg("DynamoConnector")) is True
assert (
_uses_dynamo_connector(_make_engine_cfg("PdConnector", _PD_KVBM_NIXL)) is True
)
assert _uses_dynamo_connector(_make_engine_cfg("NixlConnector")) is False
assert _uses_dynamo_connector(_make_engine_cfg()) is False
def test_headless_namespace_has_required_fields(mock_vllm_cli):
"""Test that build_headless_namespace produces a Namespace with fields
required by vLLM's run_headless(), including the api_server_count fallback."""
......@@ -235,7 +333,14 @@ def test_disaggregation_mode_default(mock_vllm_cli):
def test_disaggregation_mode_prefill(mock_vllm_cli):
"""Test --disaggregation-mode prefill sets correct state."""
mock_vllm_cli("--model", "Qwen/Qwen3-0.6B", "--disaggregation-mode", "prefill")
mock_vllm_cli(
"--model",
"Qwen/Qwen3-0.6B",
"--disaggregation-mode",
"prefill",
"--kv-transfer-config",
'{"kv_connector":"NixlConnector","kv_role":"kv_both"}',
)
config = parse_args()
assert config.disaggregation_mode == DisaggregationMode.PREFILL
assert config.is_prefill_worker is True
......@@ -254,7 +359,13 @@ def test_disaggregation_mode_decode(mock_vllm_cli):
def test_legacy_is_prefill_worker_emits_deprecation(mock_vllm_cli):
"""Test that --is-prefill-worker still works but emits DeprecationWarning."""
mock_vllm_cli("--model", "Qwen/Qwen3-0.6B", "--is-prefill-worker")
mock_vllm_cli(
"--model",
"Qwen/Qwen3-0.6B",
"--is-prefill-worker",
"--kv-transfer-config",
'{"kv_connector":"NixlConnector","kv_role":"kv_both"}',
)
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
config = parse_args()
......
......@@ -161,7 +161,7 @@ vLLM workers are configured through command-line arguments. Key parameters inclu
- `--model`: Model to serve (e.g., `Qwen/Qwen3-0.6B`)
- `--disaggregation-mode <mode>`: Worker role for disaggregated serving. Accepted values: `prefill`, `decode`, `agg` (default)
- `--metrics-endpoint-port`: Port for publishing KV metrics to Dynamo
- `--connector`: Specify which kv_transfer_config you want vllm to use `[nixl, lmcache, kvbm, none]`. This is a helper flag which overwrites the engines KVTransferConfig.
- `--kv-transfer-config`: JSON string specifying the vLLM KVTransferConfig (e.g., `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'`). See vLLM documentation for details.
- `--enable-prompt-embeds`: **Enable prompt embeddings feature** (opt-in, default: disabled)
- **Required for:** Accepting pre-computed prompt embeddings via API
- **Default behavior:** Prompt embeddings DISABLED - requests with `prompt_embeds` will fail
......
......@@ -23,7 +23,7 @@ When running vLLM through Dynamo, vLLM engine metrics are automatically passed t
| Variable/Flag | Description | Default | Example |
|---------------|-------------|---------|---------|
| `DYN_SYSTEM_PORT` | System metrics/health port. Required to expose `/metrics` endpoint. | `-1` (disabled) | `8081` |
| `--connector` | KV connector to use. Use `lmcache` to enable LMCache metrics. | `nixl` | `--connector lmcache` |
| `--kv-transfer-config` | KV transfer configuration JSON. Use LMCache connector to enable LMCache metrics. | - | `--kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'` |
## Getting Started Quickly
......@@ -109,18 +109,18 @@ For the complete and authoritative list of all vLLM metrics, see the [official v
## LMCache Metrics
When LMCache is enabled with `--connector lmcache` and `DYN_SYSTEM_PORT` is set, LMCache metrics (prefixed with `lmcache:`) are automatically exposed via Dynamo's `/metrics` endpoint alongside vLLM and Dynamo metrics.
When LMCache is enabled with `--kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'` and `DYN_SYSTEM_PORT` is set, LMCache metrics (prefixed with `lmcache:`) are automatically exposed via Dynamo's `/metrics` endpoint alongside vLLM and Dynamo metrics.
### Minimum Requirements
To access LMCache metrics, both of these are required:
1. `--connector lmcache` - Enables LMCache in vLLM
1. `--kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'` - Enables LMCache in vLLM
2. `DYN_SYSTEM_PORT=8081` - Enables Dynamo's metrics HTTP endpoint
**Example:**
```bash
DYN_SYSTEM_PORT=8081 \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
```
### Viewing LMCache Metrics
......
......@@ -161,7 +161,7 @@ The `/v1/videos` endpoint also accepts NVIDIA extensions via the `nvext` field f
| `--omni` | Enable the vLLM-Omni orchestrator (required for all omni workloads) |
| `--output-modalities <modality>` | Output modality: `text`, `image`, or `video` |
| `--stage-configs-path <path>` | Path to stage config YAML (optional; vLLM-Omni uses model defaults if omitted) |
| `--connector none` | Disable KV connector (recommended for omni workers) |
| _(no `--kv-transfer-config`)_ | KV connector is disabled by default; omit the flag for omni workers |
| `--media-output-fs-url <url>` | Filesystem URL for storing generated media (default: `file:///tmp/dynamo_media`) |
| `--media-output-http-url <url>` | Base URL for rewriting media paths in responses (optional) |
......
......@@ -264,7 +264,7 @@ DYN_KVBM_CPU_CACHE_GB=20 \
python -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \
--enforce-eager \
--connector kvbm
--kv-transfer-config '{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"}'
```
### Enable Metrics for TensorRT-LLM
......@@ -420,7 +420,7 @@ python -m dynamo.frontend &
DYN_KVBM_CPU_CACHE_GB=10 \
nsys profile -o /tmp/kvbm-nsys --trace-fork-before-exec=true --cuda-graph-trace=node --delay 30 --duration 60 \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector kvbm
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"}'
```
## See Also
......
......@@ -81,7 +81,6 @@ The LoRA system consists of:
# Start vLLM worker with LoRA flags
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager \
--connector none \
--enable-lora \
--max-lora-rank 64
```
......
......@@ -34,11 +34,11 @@ title: FlexKV
### Enable FlexKV
Set the `DYNAMO_USE_FLEXKV` environment variable and use the `--connector flexkv` flag:
Set the `DYNAMO_USE_FLEXKV` environment variable and use the `--kv-transfer-config` flag:
```bash
export DYNAMO_USE_FLEXKV=1
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector flexkv
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"FlexKVConnector","kv_role":"kv_both"}'
```
## Aggregated Serving
......@@ -52,7 +52,7 @@ python -m dynamo.frontend &
# Terminal 2: Start vLLM worker with FlexKV
DYNAMO_USE_FLEXKV=1 \
FLEXKV_CPU_CACHE_GB=32 \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector flexkv
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"FlexKVConnector","kv_role":"kv_both"}'
```
### With KV-Aware Routing
......@@ -72,7 +72,7 @@ FLEXKV_SERVER_RECV_PORT="ipc:///tmp/flexkv_server_0" \
CUDA_VISIBLE_DEVICES=0 \
python -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \
--connector flexkv \
--kv-transfer-config '{"kv_connector":"FlexKVConnector","kv_role":"kv_both"}' \
--gpu-memory-utilization 0.2 \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' &
......@@ -83,7 +83,7 @@ FLEXKV_SERVER_RECV_PORT="ipc:///tmp/flexkv_server_1" \
CUDA_VISIBLE_DEVICES=1 \
python -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \
--connector flexkv \
--kv-transfer-config '{"kv_connector":"FlexKVConnector","kv_role":"kv_both"}' \
--gpu-memory-utilization 0.2 \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
```
......@@ -97,7 +97,7 @@ FlexKV can be used with disaggregated prefill/decode serving. The prefill worker
python -m dynamo.frontend &
# Terminal 2: Decode worker (without FlexKV)
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl &
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' &
# Terminal 3: Prefill worker (with FlexKV)
DYN_VLLM_KV_EVENT_PORT=20081 \
......@@ -108,7 +108,7 @@ CUDA_VISIBLE_DEVICES=1 \
python -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \
--disaggregation-mode prefill \
--connector nixl flexkv
--kv-transfer-config '{"kv_connector":"FlexKVConnector","kv_role":"kv_both"}'
```
## Configuration
......
......@@ -20,10 +20,10 @@ This document describes how LMCache is integrated into Dynamo's vLLM backend to
### Configuration
LMCache is enabled using the `--connector lmcache` flag:
LMCache is enabled using the `--kv-transfer-config` flag:
```bash
python -m dynamo.vllm --model <model_name> --connector lmcache
python -m dynamo.vllm --model <model_name> --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
```
### Customization
......@@ -157,11 +157,11 @@ kv_transfer_config = KVTransferConfig(
## Metrics and Monitoring
When LMCache is enabled with `--connector lmcache` and `DYN_SYSTEM_PORT` is set, LMCache metrics are automatically exposed via Dynamo's `/metrics` endpoint alongside vLLM and Dynamo metrics.
When LMCache is enabled with `--kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'` and `DYN_SYSTEM_PORT` is set, LMCache metrics are automatically exposed via Dynamo's `/metrics` endpoint alongside vLLM and Dynamo metrics.
**Requirements to access LMCache metrics:**
- `--connector lmcache` - Enables LMCache
- `--kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'` - Enables LMCache
- `DYN_SYSTEM_PORT=8081` - Enables metrics HTTP endpoint
- `PROMETHEUS_MULTIPROC_DIR` (optional) - If not set, Dynamo manages it internally
......
......@@ -26,7 +26,7 @@ title: Integration README
```bash
# Add installation and usage from existing integration docs
# Example pattern (LMCache):
# python -m dynamo.vllm --model <model> --connector lmcache
# python -m dynamo.vllm --model <model> --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
```
## Configuration
......
......@@ -86,7 +86,7 @@ extraPodSpec:
- `--enable-prompt-embeds`: Enable prompt embeddings feature
- `--enable-multimodal`: Enable multimodal (vision) support
- `--disaggregation-mode prefill`: Prefill-only mode for disaggregated serving
- `--connector [nixl|lmcache|kvbm|none]`: KV transfer backend selection
- `--kv-transfer-config '<json>'`: KV transfer backend configuration (e.g., `'{"kv_connector":"NixlConnector","kv_role":"kv_both"}'`)
## Prerequisites
......
......@@ -41,5 +41,5 @@ spec:
- --max-model-len
- "32000"
- --enforce-eager
- --connector
- kvbm
- --kv-transfer-config
- '{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"}'
......@@ -44,8 +44,6 @@ spec:
- --tensor-parallel-size
- "2"
- --is-decode-worker
- --connector
- none
- --kv-transfer-config
- '{"kv_connector": "NixlConnector", "kv_role": "kv_both", "engine_id":
"vllm-disagg-decode-engine-0abc123"}'
......@@ -73,8 +71,6 @@ spec:
- "2"
- --disaggregation-mode
- prefill
- --connector
- none
- --kv-transfer-config
- '{"kv_connector": "NixlConnector", "kv_role": "kv_both", "engine_id":
"vllm-disagg-prefill-engine-0abc123"}'
......@@ -66,6 +66,5 @@ spec:
- --max-model-len
- "32000"
- --enforce-eager
- --connector
- kvbm
- nixl
- --kv-transfer-config
- '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}'
......@@ -66,6 +66,5 @@ spec:
- --max-model-len
- "32000"
- --enforce-eager
- --connector
- kvbm
- nixl
- --kv-transfer-config
- '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}'
......@@ -74,8 +74,7 @@ spec:
- --max-model-len
- "32000"
- --enforce-eager
- --connector
- kvbm
- nixl
- --kv-transfer-config
- '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}'
- --tensor-parallel-size
- "2"
......@@ -60,8 +60,6 @@ spec:
args:
- --model
- Qwen/Qwen3-0.6B
- --connector
- none
- --enable-lora
- --max-lora-rank
- "64"
......
......@@ -29,4 +29,4 @@ python -m dynamo.frontend &
# run worker
# --enforce-eager is added for quick deployment. for production use, need to remove this flag
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm --model "$MODEL" --enforce-eager --connector none "${EXTRA_ARGS[@]}"
python -m dynamo.vllm --model "$MODEL" --enforce-eager "${EXTRA_ARGS[@]}"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment