chore: remove deprecated automatic kv events config for vLLM LLM-94 (#7591)

Signed-off-by: alec-flowers <aflowers@nvidia.com>

chore: remove deprecated automatic kv events config for vLLM LLM-94 (#7591)
Signed-off-by: alec-flowers <aflowers@nvidia.com>
115512ef · Alec · GitHub · 3a1561fb · 115512ef · 115512ef
Unverified commit 115512ef, authored Mar 23, 2026 by Alec; committed by GitHub on Mar 24, 2026
5 changed files
--- a/components/src/dynamo/vllm/args.py
+++ b/components/src/dynamo/vllm/args.py
@@ -6,7 +6,6 @@ import json
 import logging
 import os
 import socket
-import warnings
 from typing import Any, Dict, Optional
 from vllm.distributed.kv_events import KVEventsConfig
@@ -321,28 +320,7 @@ def create_kv_events_config(
        logger.info(f"Using user-provided kv_events_config {c}")
        return c
-    # Create default events config for prefix caching
+    return None
-    # TODO: move this to configuration system.
-    port = envs.DYN_VLLM_KV_EVENT_PORT
-    warnings.warn(
-        "Automatic KV events configuration is deprecated and will be removed in "
-        "the next release. After that, KV events will be disabled by default "
-        "(matching upstream vLLM). To preserve current behavior, pass "
-        "--kv-events-config explicitly. For example:\n"
-        f'  --kv-events-config \'{{"enable_kv_cache_events":true,"publisher":"zmq","endpoint":"tcp://*:{port}"}}\'\n'
-        "See docs/backends/vllm/README.md for details.",
-        FutureWarning,
-        stacklevel=2,
-    )
-    logger.info(
-        f"Using env-var DYN_VLLM_KV_EVENT_PORT={port} to create kv_events_config"
-    )
-    dp_rank = engine_config.data_parallel_rank or 0
-    return KVEventsConfig(
-        enable_kv_cache_events=True,
-        publisher="zmq",
-        endpoint=f"tcp://*:{port - dp_rank}",  # vLLM will iterate dp_rank for us, so we need to subtract it out TODO: fix in vLLM
-    )
 def _uses_nixl_connector(engine_config: AsyncEngineArgs) -> bool:

--- a/components/src/dynamo/vllm/envs.py
+++ b/components/src/dynamo/vllm/envs.py
@@ -19,7 +19,6 @@ REGISTERED_PORT_MIN = 1024
 REGISTERED_PORT_MAX = 49151
 if TYPE_CHECKING:
-    DYN_VLLM_KV_EVENT_PORT: int = 20080
    DYN_FORWARDPASS_METRIC_PORT: int = 20380
@@ -59,9 +58,6 @@ def _resolve_port(env_var: str, default_port: int) -> int:
 # Environment variables configuration
 environment_variables: dict[str, Callable[[], Any]] = {
-    # Port used for KV events publishing to the frontend
-    # Note: This env variable is ignored if explicitly using --kv-events-config ''
-    "DYN_VLLM_KV_EVENT_PORT": lambda: _resolve_port("DYN_VLLM_KV_EVENT_PORT", 20080),
    "DYN_FORWARDPASS_METRIC_PORT": lambda: _resolve_port(
        "DYN_FORWARDPASS_METRIC_PORT", 20380
    ),

--- a/components/src/dynamo/vllm/tests/test_vllm_unit.py
+++ b/components/src/dynamo/vllm/tests/test_vllm_unit.py
@@ -332,6 +332,14 @@ def test_disaggregation_mode_default(mock_vllm_cli):
    assert config.is_decode_worker is False
+def test_kv_events_disabled_by_default_without_explicit_config(mock_vllm_cli):
+    """Test that vLLM no longer auto-creates kv_events_config."""
+    mock_vllm_cli("--model", "Qwen/Qwen3-0.6B")
+    config = parse_args()
+    assert config.engine_args.kv_events_config is None
+    assert config.use_kv_events is False
 def test_disaggregation_mode_prefill(mock_vllm_cli):
    """Test --disaggregation-mode prefill sets correct state."""
    mock_vllm_cli(

--- a/docs/getting-started/local-installation.md
+++ b/docs/getting-started/local-installation.md
@@ -171,7 +171,7 @@ For dependency-free local development, disable KV event publishing (avoids NATS)
 - **SGLang:** No flag needed (KV events disabled by default)
 - **TensorRT-LLM:** No flag needed (KV events disabled by default)
-vLLM automatically enables KV event publishing when prefix caching is active. In a future release, KV events will be disabled by default for all backends. Start using `--kv-events-config` explicitly to prepare.
+KV events are disabled by default for all backends. Add `--kv-events-config` explicitly only when you want KV event publishing enabled.
 ## Test Your Deployment

--- a/docs/integrations/flexkv-integration.md
+++ b/docs/integrations/flexkv-integration.md
@@ -98,7 +98,6 @@ python -m dynamo.frontend &
 CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' &
 # Terminal 3: Prefill worker (with FlexKV)
-DYN_VLLM_KV_EVENT_PORT=20081 \
 VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
 DYNAMO_USE_FLEXKV=1 \
 FLEXKV_CPU_CACHE_GB=32 \
@@ -106,7 +105,8 @@ CUDA_VISIBLE_DEVICES=1 \
  python -m dynamo.vllm \
  --model Qwen/Qwen3-0.6B \
  --disaggregation-mode prefill \
-  --kv-transfer-config '{"kv_connector":"FlexKVConnectorV1","kv_role":"kv_both"}'
+  --kv-transfer-config '{"kv_connector":"FlexKVConnectorV1","kv_role":"kv_both"}' \
+  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
 ```
 ## Configuration
@@ -203,4 +203,3 @@ curl localhost:8000/v1/chat/completions \
 - [FlexKV GitHub Repository](https://github.com/taco-project/FlexKV)
 - [FlexKV vLLM Adapter Documentation](https://github.com/taco-project/FlexKV/blob/main/docs/vllm_adapter/README_en.md)