chore: unify LMCache UX w/ and w/o Dynamo (#4509)

Signed-off-by: Ziqi Fan <ziqif@nvidia.com>

chore: unify LMCache UX w/ and w/o Dynamo (#4509)
Signed-off-by: Ziqi Fan <ziqif@nvidia.com>
6d69e8c7 · Ziqi Fan · GitHub · b1286e7f · 6d69e8c7 · 6d69e8c7
Unverified Commit 6d69e8c7 authored Nov 21, 2025 by Ziqi Fan Committed by GitHub Nov 21, 2025
11 changed files
--- a/components/src/dynamo/vllm/args.py
+++ b/components/src/dynamo/vllm/args.py
@@ -26,9 +26,6 @@ logger = logging.getLogger(__name__)
 DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
 VALID_CONNECTORS = {"nixl", "lmcache", "kvbm", "null", "none"}

-# Global LMCache configuration - initialize once on module import
-ENABLE_LMCACHE = os.getenv("ENABLE_LMCACHE", "0").lower() in ("1", "true", "yes")
-

 class Config:
    """Command line parameters or defaults"""

--- a/components/src/dynamo/vllm/main.py
+++ b/components/src/dynamo/vllm/main.py
@@ -33,7 +33,7 @@ from dynamo.vllm.multimodal_handlers import (
    ProcessorHandler,
 )

-from .args import ENABLE_LMCACHE, Config, overwrite_args, parse_args
+from .args import Config, overwrite_args, parse_args
 from .handlers import DecodeWorkerHandler, PrefillWorkerHandler
 from .health_check import VllmHealthCheckPayload, VllmPrefillHealthCheckPayload
 from .publisher import StatLoggerFactory
@@ -42,22 +42,6 @@ configure_dynamo_logging()
 logger = logging.getLogger(__name__)


-def setup_lmcache_environment():
-    """Setup LMCache environment variables for KV cache offloading"""
-    # LMCache configuration for matching logic
-    lmcache_config = {
-        "LMCACHE_CHUNK_SIZE": "256",  # Token chunk size
-        "LMCACHE_LOCAL_CPU": "True",  # Enable CPU memory backend
-        "LMCACHE_MAX_LOCAL_CPU_SIZE": "20",  # CPU memory limit in GB
-    }
-
-    # Set environment variables
-    for key, value in lmcache_config.items():
-        if key not in os.environ:  # Only set if not already configured
-            os.environ[key] = value
-            logger.info(f"Set LMCache environment variable: {key}={value}")
-
-
 async def graceful_shutdown(runtime):
    """
    Shutdown dynamo distributed runtime.
@@ -214,13 +198,6 @@ def setup_vllm_engine(config, stat_logger=None):

    engine_args = config.engine_args

-    # KV transfer config is now handled by args.py based on ENABLE_LMCACHE env var
-    if ENABLE_LMCACHE:
-        setup_lmcache_environment()
-        logger.info("LMCache enabled for VllmWorker")
-    else:
-        logger.debug("LMCache is disabled")
-
    # Load default sampling params from `generation_config.json`
    default_sampling_params = (
        engine_args.create_model_config().get_diff_sampling_param()
@@ -258,12 +235,8 @@ def setup_vllm_engine(config, stat_logger=None):
        disable_log_requests=engine_args.disable_log_requests,
        disable_log_stats=engine_args.disable_log_stats,
    )
-    if ENABLE_LMCACHE:
-        logger.info(
-            f"VllmWorker for {config.served_model_name} has been initialized with LMCache"
-        )
-    else:
-        logger.info(f"VllmWorker for {config.served_model_name} has been initialized")
+
+    logger.info(f"VllmWorker for {config.served_model_name} has been initialized")

    return engine_client, vllm_config, default_sampling_params, prometheus_temp_dir


--- a/docs/backends/vllm/LMCache_Integration.md
+++ b/docs/backends/vllm/LMCache_Integration.md
@@ -26,37 +26,9 @@ LMCache is enabled using the `--connector lmcache` flag:
 python -m dynamo.vllm --model <model_name> --connector lmcache
 ```

-**The `--connector lmcache` flag is required** to enable LMCache in vLLM. Optionally set `ENABLE_LMCACHE=1` to use Dynamo's default LMCache configuration values, or set individual `LMCACHE_*` environment variables for custom configuration.
+### Customization

-### Environment Variables
-
-LMCache configuration can be customized via environment variables:
-
-**Option 1: Use Dynamo Defaults (Recommended)**
-```bash
-export ENABLE_LMCACHE=1  # Sets Dynamo's recommended defaults
-python -m dynamo.vllm --model <model_name> --connector lmcache
-```
-
-Dynamo sets these defaults when `ENABLE_LMCACHE=1`:
- `LMCACHE_CHUNK_SIZE=256` - Token chunk size for cache granularity
- `LMCACHE_LOCAL_CPU=True` - Enable CPU memory backend for offloading
- `LMCACHE_MAX_LOCAL_CPU_SIZE=20` - CPU memory limit in GB
-
-**Option 2: Set Individual Variables**
-```bash
-export LMCACHE_CHUNK_SIZE=256
-export LMCACHE_LOCAL_CPU=True
-export LMCACHE_MAX_LOCAL_CPU_SIZE=20
-python -m dynamo.vllm --model <model_name> --connector lmcache
-```
-
-**Option 3: Use LMCache Defaults**
-```bash
-# Just use --connector lmcache without env vars
-python -m dynamo.vllm --model <model_name> --connector lmcache
-# LMCache will use its own defaults (chunk_size=256, local_cpu=True, max_local_cpu_size=5GB)
-```
+LMCache configuration can be customized via the environment variables listed in the [LMCache configuration reference](https://docs.lmcache.ai/api_reference/configurations.html).

 For advanced configurations, LMCache supports multiple [storage backends](https://docs.lmcache.ai/index.html):
 - **CPU RAM**: Fast local memory offloading
@@ -87,10 +59,6 @@ In aggregated mode, the system uses:

 Disaggregated serving separates prefill and decode operations into dedicated workers. This provides better resource utilization and scalability for production deployments.

-### Configuration
-
-The same `ENABLE_LMCACHE=1` environment variable enables LMCache, but the system automatically configures different connector setups for prefill and decode workers.
-
 ### Deployment

 Use the provided disaggregated launch script (the script requires at least 2 GPUs):
@@ -127,7 +95,7 @@ The system automatically configures KV transfer based on the deployment mode and
 #### Prefill Worker (Disaggregated Mode)
 ```python
 kv_transfer_config = KVTransferConfig(
-    kv_connector="MultiConnector",
+    kv_connector="PdConnector",
    kv_role="kv_both",
    kv_connector_extra_config={
        "connectors": [
@@ -154,22 +122,9 @@ kv_transfer_config = KVTransferConfig(
 )
 ```

-### Environment Setup
-
-The system automatically configures LMCache environment variables when enabled:
-
-```python
-lmcache_config = {
-    "LMCACHE_CHUNK_SIZE": "256",
-    "LMCACHE_LOCAL_CPU": "True",
-    "LMCACHE_MAX_LOCAL_CPU_SIZE": "20"
-}
-```
-
 ### Integration Points

 1. **Argument Parsing** (`args.py`):
-   - Detects `ENABLE_LMCACHE` environment variable
   - Configures appropriate KV transfer settings
   - Sets up connector configurations based on worker type


--- a/docs/backends/vllm/prometheus.md
+++ b/docs/backends/vllm/prometheus.md
@@ -23,7 +23,6 @@ When running vLLM through Dynamo, vLLM engine metrics are automatically passed t
 |---------------|-------------|---------|---------|
 | `DYN_SYSTEM_PORT` | System metrics/health port. Required to expose `/metrics` endpoint. | `-1` (disabled) | `8081` |
 | `--connector` | KV connector to use. Use `lmcache` to enable LMCache metrics. | `nixl` | `--connector lmcache` |
-| `ENABLE_LMCACHE` | Sets Dynamo's recommended LMCache defaults (optional). | Not set | `ENABLE_LMCACHE=1` |

 ## Getting Started Quickly

@@ -117,18 +116,12 @@ To access LMCache metrics, both of these are required:
 1. `--connector lmcache` - Enables LMCache in vLLM
 2. `DYN_SYSTEM_PORT=8081` - Enables Dynamo's metrics HTTP endpoint

-**Minimal example:**
+**Example:**
 ```bash
 DYN_SYSTEM_PORT=8081 \
 python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache
 ```

-**Recommended (with Dynamo defaults):**
-```bash
-DYN_SYSTEM_PORT=8081 ENABLE_LMCACHE=1 \
-python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache
-```
-
 ### Viewing LMCache Metrics

 ```bash

--- a/examples/backends/vllm/launch/agg_lmcache.sh
+++ b/examples/backends/vllm/launch/agg_lmcache.sh
@@ -9,8 +9,4 @@ python -m dynamo.frontend --http-port=8000 &

 # run worker with LMCache enabled
 DYN_SYSTEM_PORT=8081 \
-ENABLE_LMCACHE=1 \
-LMCACHE_CHUNK_SIZE=256 \
-LMCACHE_LOCAL_CPU=True \
-LMCACHE_MAX_LOCAL_CPU_SIZE=20 \
  python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache
--- a/examples/backends/vllm/launch/disagg_lmcache.sh
+++ b/examples/backends/vllm/launch/disagg_lmcache.sh
@@ -16,10 +16,6 @@ sleep 20
 # run prefill worker on GPU 1 with LMCache
 DYN_VLLM_KV_EVENT_PORT=20081 \
 VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
-ENABLE_LMCACHE=1 \
-LMCACHE_CHUNK_SIZE=256 \
-LMCACHE_LOCAL_CPU=True \
-LMCACHE_MAX_LOCAL_CPU_SIZE=20 \
 CUDA_VISIBLE_DEVICES=1 \
  python3 -m dynamo.vllm \
    --model Qwen/Qwen3-0.6B \

--- a/tests/lmcache/README.md
+++ b/tests/lmcache/README.md
@@ -5,8 +5,8 @@ Test the correctness of Dynamo integration with LMCache by comparing MMLU benchm

 ## Testing Principle
 Compare MMLU test results under two configurations:
- **Baseline Test**: Dynamo without LMCache (`ENABLE_LMCACHE=0`)
- **LMCache Test**: Dynamo with LMCache enabled (`ENABLE_LMCACHE=1`)
+- **Baseline Test**: Dynamo without LMCache
+- **LMCache Test**: Dynamo with LMCache enabled

 If both configurations produce the same inference results, it verifies that LMCache functionality is correct.

@@ -63,14 +63,12 @@ python3 summarize_scores_dynamo.py
 ### Baseline Architecture (deploy-baseline-dynamo.sh)
 ```
 HTTP Request → Dynamo Ingress(8000) → Dynamo Worker → Direct Inference
-Environment: ENABLE_LMCACHE=0
 ```

 ### LMCache Architecture (deploy-lmcache_enabled-dynamo.sh)
 ```
 HTTP Request → Dynamo Ingress(8000) → Dynamo Worker → LMCache-enabled Inference
-Environment: ENABLE_LMCACHE=1
-            LMCACHE_CHUNK_SIZE=256
+Environment: LMCACHE_CHUNK_SIZE=256
            LMCACHE_LOCAL_CPU=True
            LMCACHE_MAX_LOCAL_CPU_SIZE=1.0
 ```

--- a/tests/lmcache/deploy-baseline-dynamo-disag.sh
+++ b/tests/lmcache/deploy-baseline-dynamo-disag.sh
@@ -30,8 +30,6 @@ echo "🧹 Cleaning up any existing dynamo processes..."
 pkill -f "dynamo-run" || true
 sleep 2

-# Disable LMCache
-export ENABLE_LMCACHE=0
 echo "🔧 Starting dynamo disaggregated serving without LMCache..."

 python -m dynamo.frontend &

--- a/tests/lmcache/deploy-baseline-dynamo.sh
+++ b/tests/lmcache/deploy-baseline-dynamo.sh
@@ -28,10 +28,7 @@ echo "🧹 Cleaning up any existing dynamo processes..."
 pkill -f "dynamo-run" || true
 sleep 2

-# Disable LMCache
-export ENABLE_LMCACHE=0
 echo "🔧 Starting dynamo worker without LMCache..."

-
 python -m dynamo.frontend &
 python3 -m dynamo.vllm --model $MODEL_URL
\ No newline at end of file
--- a/tests/lmcache/deploy-lmcache_enabled-dynamo-disag.sh
+++ b/tests/lmcache/deploy-lmcache_enabled-dynamo-disag.sh
@@ -40,7 +40,6 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model $MODEL_URL&
 sleep 20

 # run prefill worker on GPU 1 with LMCache
-ENABLE_LMCACHE=1 \
 LMCACHE_CHUNK_SIZE=256 \
 LMCACHE_LOCAL_CPU=True \
 LMCACHE_MAX_LOCAL_CPU_SIZE=20 \

--- a/tests/lmcache/deploy-lmcache_enabled-dynamo.sh
+++ b/tests/lmcache/deploy-lmcache_enabled-dynamo.sh
@@ -32,5 +32,5 @@ sleep 2
 echo "🔧 Starting dynamo worker with LMCache enabled..."

 python -m dynamo.frontend &
-ENABLE_LMCACHE=1 \
-  python3 -m dynamo.vllm --model $MODEL_URL
\ No newline at end of file
+
+python3 -m dynamo.vllm --model $MODEL_URL --connector lmcache
\ No newline at end of file