Commit 115512ef (unverified), authored by Alec, committed by GitHub
Browse files

chore: remove deprecated automatic kv events config for vLLM LLM-94 (#7591)


Signed-off-by: alec-flowers <aflowers@nvidia.com>
parent 3a1561fb
...@@ -6,7 +6,6 @@ import json ...@@ -6,7 +6,6 @@ import json
import logging import logging
import os import os
import socket import socket
import warnings
from typing import Any, Dict, Optional from typing import Any, Dict, Optional
from vllm.distributed.kv_events import KVEventsConfig from vllm.distributed.kv_events import KVEventsConfig
...@@ -321,28 +320,7 @@ def create_kv_events_config( ...@@ -321,28 +320,7 @@ def create_kv_events_config(
logger.info(f"Using user-provided kv_events_config {c}") logger.info(f"Using user-provided kv_events_config {c}")
return c return c
# Create default events config for prefix caching return None
# TODO: move this to configuration system.
port = envs.DYN_VLLM_KV_EVENT_PORT
warnings.warn(
"Automatic KV events configuration is deprecated and will be removed in "
"the next release. After that, KV events will be disabled by default "
"(matching upstream vLLM). To preserve current behavior, pass "
"--kv-events-config explicitly. For example:\n"
f' --kv-events-config \'{{"enable_kv_cache_events":true,"publisher":"zmq","endpoint":"tcp://*:{port}"}}\'\n'
"See docs/backends/vllm/README.md for details.",
FutureWarning,
stacklevel=2,
)
logger.info(
f"Using env-var DYN_VLLM_KV_EVENT_PORT={port} to create kv_events_config"
)
dp_rank = engine_config.data_parallel_rank or 0
return KVEventsConfig(
enable_kv_cache_events=True,
publisher="zmq",
endpoint=f"tcp://*:{port - dp_rank}", # vLLM will iterate dp_rank for us, so we need to subtract it out TODO: fix in vLLM
)
def _uses_nixl_connector(engine_config: AsyncEngineArgs) -> bool: def _uses_nixl_connector(engine_config: AsyncEngineArgs) -> bool:
......
...@@ -19,7 +19,6 @@ REGISTERED_PORT_MIN = 1024 ...@@ -19,7 +19,6 @@ REGISTERED_PORT_MIN = 1024
REGISTERED_PORT_MAX = 49151 REGISTERED_PORT_MAX = 49151
if TYPE_CHECKING: if TYPE_CHECKING:
DYN_VLLM_KV_EVENT_PORT: int = 20080
DYN_FORWARDPASS_METRIC_PORT: int = 20380 DYN_FORWARDPASS_METRIC_PORT: int = 20380
...@@ -59,9 +58,6 @@ def _resolve_port(env_var: str, default_port: int) -> int: ...@@ -59,9 +58,6 @@ def _resolve_port(env_var: str, default_port: int) -> int:
# Environment variables configuration # Environment variables configuration
environment_variables: dict[str, Callable[[], Any]] = { environment_variables: dict[str, Callable[[], Any]] = {
# Port used for KV events publishing to the frontend
# Note: This env variable is ignored if explicitly using --kv-events-config ''
"DYN_VLLM_KV_EVENT_PORT": lambda: _resolve_port("DYN_VLLM_KV_EVENT_PORT", 20080),
"DYN_FORWARDPASS_METRIC_PORT": lambda: _resolve_port( "DYN_FORWARDPASS_METRIC_PORT": lambda: _resolve_port(
"DYN_FORWARDPASS_METRIC_PORT", 20380 "DYN_FORWARDPASS_METRIC_PORT", 20380
), ),
......
...@@ -332,6 +332,14 @@ def test_disaggregation_mode_default(mock_vllm_cli): ...@@ -332,6 +332,14 @@ def test_disaggregation_mode_default(mock_vllm_cli):
assert config.is_decode_worker is False assert config.is_decode_worker is False
def test_kv_events_disabled_by_default_without_explicit_config(mock_vllm_cli):
    """With only ``--model`` given, no KV events config is created.

    Checks both the engine-level ``kv_events_config`` (expected to be
    ``None``) and the parsed ``use_kv_events`` flag (expected to be
    ``False``) when ``--kv-events-config`` is not passed.
    """
    # Simulate a command line that carries nothing but the model name.
    mock_vllm_cli("--model", "Qwen/Qwen3-0.6B")

    parsed = parse_args()
    engine_args = parsed.engine_args

    assert engine_args.kv_events_config is None
    assert parsed.use_kv_events is False
def test_disaggregation_mode_prefill(mock_vllm_cli): def test_disaggregation_mode_prefill(mock_vllm_cli):
"""Test --disaggregation-mode prefill sets correct state.""" """Test --disaggregation-mode prefill sets correct state."""
mock_vllm_cli( mock_vllm_cli(
......
...@@ -171,7 +171,7 @@ For dependency-free local development, disable KV event publishing (avoids NATS) ...@@ -171,7 +171,7 @@ For dependency-free local development, disable KV event publishing (avoids NATS)
- **SGLang:** No flag needed (KV events disabled by default) - **SGLang:** No flag needed (KV events disabled by default)
- **TensorRT-LLM:** No flag needed (KV events disabled by default) - **TensorRT-LLM:** No flag needed (KV events disabled by default)
vLLM automatically enables KV event publishing when prefix caching is active. In a future release, KV events will be disabled by default for all backends. Start using `--kv-events-config` explicitly to prepare. KV events are disabled by default for all backends. Add `--kv-events-config` explicitly only when you want KV event publishing enabled.
## Test Your Deployment ## Test Your Deployment
......
...@@ -98,7 +98,6 @@ python -m dynamo.frontend & ...@@ -98,7 +98,6 @@ python -m dynamo.frontend &
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' & CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' &
# Terminal 3: Prefill worker (with FlexKV) # Terminal 3: Prefill worker (with FlexKV)
DYN_VLLM_KV_EVENT_PORT=20081 \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
DYNAMO_USE_FLEXKV=1 \ DYNAMO_USE_FLEXKV=1 \
FLEXKV_CPU_CACHE_GB=32 \ FLEXKV_CPU_CACHE_GB=32 \
...@@ -106,7 +105,8 @@ CUDA_VISIBLE_DEVICES=1 \ ...@@ -106,7 +105,8 @@ CUDA_VISIBLE_DEVICES=1 \
python -m dynamo.vllm \ python -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model Qwen/Qwen3-0.6B \
--disaggregation-mode prefill \ --disaggregation-mode prefill \
--kv-transfer-config '{"kv_connector":"FlexKVConnectorV1","kv_role":"kv_both"}' --kv-transfer-config '{"kv_connector":"FlexKVConnectorV1","kv_role":"kv_both"}' \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
``` ```
## Configuration ## Configuration
...@@ -203,4 +203,3 @@ curl localhost:8000/v1/chat/completions \ ...@@ -203,4 +203,3 @@ curl localhost:8000/v1/chat/completions \
- [FlexKV GitHub Repository](https://github.com/taco-project/FlexKV) - [FlexKV GitHub Repository](https://github.com/taco-project/FlexKV)
- [FlexKV vLLM Adapter Documentation](https://github.com/taco-project/FlexKV/blob/main/docs/vllm_adapter/README_en.md) - [FlexKV vLLM Adapter Documentation](https://github.com/taco-project/FlexKV/blob/main/docs/vllm_adapter/README_en.md)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment