Unverified Commit 6d69e8c7 authored by Ziqi Fan's avatar Ziqi Fan Committed by GitHub
Browse files

chore: unify LMCache UX w/ and w/o Dynamo (#4509)


Signed-off-by: Ziqi Fan <ziqif@nvidia.com>
parent b1286e7f
......@@ -26,9 +26,6 @@ logger = logging.getLogger(__name__)
DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
VALID_CONNECTORS = {"nixl", "lmcache", "kvbm", "null", "none"}
# Global LMCache configuration - initialize once on module import
ENABLE_LMCACHE = os.getenv("ENABLE_LMCACHE", "0").lower() in ("1", "true", "yes")
class Config:
"""Command line parameters or defaults"""
......
......@@ -33,7 +33,7 @@ from dynamo.vllm.multimodal_handlers import (
ProcessorHandler,
)
from .args import ENABLE_LMCACHE, Config, overwrite_args, parse_args
from .args import Config, overwrite_args, parse_args
from .handlers import DecodeWorkerHandler, PrefillWorkerHandler
from .health_check import VllmHealthCheckPayload, VllmPrefillHealthCheckPayload
from .publisher import StatLoggerFactory
......@@ -42,22 +42,6 @@ configure_dynamo_logging()
logger = logging.getLogger(__name__)
def setup_lmcache_environment():
    """Seed default LMCache environment variables for KV cache offloading.

    Each default is applied only when the variable is absent from the
    process environment, so values exported by the user are never
    overwritten. Every variable that gets set is reported at INFO level.
    """
    # Defaults used by LMCache's matching logic, as (variable, value) pairs.
    defaults = (
        ("LMCACHE_CHUNK_SIZE", "256"),  # Token chunk size
        ("LMCACHE_LOCAL_CPU", "True"),  # Enable CPU memory backend
        ("LMCACHE_MAX_LOCAL_CPU_SIZE", "20"),  # CPU memory limit in GB
    )

    for name, default in defaults:
        if name in os.environ:
            # Already configured by the caller; leave it untouched.
            continue
        os.environ[name] = default
        logger.info(f"Set LMCache environment variable: {name}={default}")
async def graceful_shutdown(runtime):
"""
Shutdown dynamo distributed runtime.
......@@ -214,13 +198,6 @@ def setup_vllm_engine(config, stat_logger=None):
engine_args = config.engine_args
# KV transfer config is now handled by args.py based on ENABLE_LMCACHE env var
if ENABLE_LMCACHE:
setup_lmcache_environment()
logger.info("LMCache enabled for VllmWorker")
else:
logger.debug("LMCache is disabled")
# Load default sampling params from `generation_config.json`
default_sampling_params = (
engine_args.create_model_config().get_diff_sampling_param()
......@@ -258,12 +235,8 @@ def setup_vllm_engine(config, stat_logger=None):
disable_log_requests=engine_args.disable_log_requests,
disable_log_stats=engine_args.disable_log_stats,
)
if ENABLE_LMCACHE:
logger.info(
f"VllmWorker for {config.served_model_name} has been initialized with LMCache"
)
else:
logger.info(f"VllmWorker for {config.served_model_name} has been initialized")
logger.info(f"VllmWorker for {config.served_model_name} has been initialized")
return engine_client, vllm_config, default_sampling_params, prometheus_temp_dir
......
......@@ -26,37 +26,9 @@ LMCache is enabled using the `--connector lmcache` flag:
python -m dynamo.vllm --model <model_name> --connector lmcache
```
**The `--connector lmcache` flag is required** to enable LMCache in vLLM. Optionally set `ENABLE_LMCACHE=1` to use Dynamo's default LMCache configuration values, or set individual `LMCACHE_*` environment variables for custom configuration.
### Customization
### Environment Variables
LMCache configuration can be customized via environment variables:
**Option 1: Use Dynamo Defaults (Recommended)**
```bash
export ENABLE_LMCACHE=1 # Sets Dynamo's recommended defaults
python -m dynamo.vllm --model <model_name> --connector lmcache
```
Dynamo sets these defaults when `ENABLE_LMCACHE=1`:
- `LMCACHE_CHUNK_SIZE=256` - Token chunk size for cache granularity
- `LMCACHE_LOCAL_CPU=True` - Enable CPU memory backend for offloading
- `LMCACHE_MAX_LOCAL_CPU_SIZE=20` - CPU memory limit in GB
**Option 2: Set Individual Variables**
```bash
export LMCACHE_CHUNK_SIZE=256
export LMCACHE_LOCAL_CPU=True
export LMCACHE_MAX_LOCAL_CPU_SIZE=20
python -m dynamo.vllm --model <model_name> --connector lmcache
```
**Option 3: Use LMCache Defaults**
```bash
# Just use --connector lmcache without env vars
python -m dynamo.vllm --model <model_name> --connector lmcache
# LMCache will use its own defaults (chunk_size=256, local_cpu=True, max_local_cpu_size=5GB)
```
LMCache configuration can be customized via environment variables listed [here](https://docs.lmcache.ai/api_reference/configurations.html).
For advanced configurations, LMCache supports multiple [storage backends](https://docs.lmcache.ai/index.html):
- **CPU RAM**: Fast local memory offloading
......@@ -87,10 +59,6 @@ In aggregated mode, the system uses:
Disaggregated serving separates prefill and decode operations into dedicated workers. This provides better resource utilization and scalability for production deployments.
### Configuration
The same `ENABLE_LMCACHE=1` environment variable enables LMCache, but the system automatically configures different connector setups for prefill and decode workers.
### Deployment
Use the provided disaggregated launch script (the script requires at least 2 GPUs):
......@@ -127,7 +95,7 @@ The system automatically configures KV transfer based on the deployment mode and
#### Prefill Worker (Disaggregated Mode)
```python
kv_transfer_config = KVTransferConfig(
kv_connector="MultiConnector",
kv_connector="PdConnector",
kv_role="kv_both",
kv_connector_extra_config={
"connectors": [
......@@ -154,22 +122,9 @@ kv_transfer_config = KVTransferConfig(
)
```
### Environment Setup
The system automatically configures LMCache environment variables when enabled:
```python
lmcache_config = {
"LMCACHE_CHUNK_SIZE": "256",
"LMCACHE_LOCAL_CPU": "True",
"LMCACHE_MAX_LOCAL_CPU_SIZE": "20"
}
```
### Integration Points
1. **Argument Parsing** (`args.py`):
- Detects `ENABLE_LMCACHE` environment variable
- Configures appropriate KV transfer settings
- Sets up connector configurations based on worker type
......
......@@ -23,7 +23,6 @@ When running vLLM through Dynamo, vLLM engine metrics are automatically passed t
|---------------|-------------|---------|---------|
| `DYN_SYSTEM_PORT` | System metrics/health port. Required to expose `/metrics` endpoint. | `-1` (disabled) | `8081` |
| `--connector` | KV connector to use. Use `lmcache` to enable LMCache metrics. | `nixl` | `--connector lmcache` |
| `ENABLE_LMCACHE` | Sets Dynamo's recommended LMCache defaults (optional). | Not set | `ENABLE_LMCACHE=1` |
## Getting Started Quickly
......@@ -117,18 +116,12 @@ To access LMCache metrics, both of these are required:
1. `--connector lmcache` - Enables LMCache in vLLM
2. `DYN_SYSTEM_PORT=8081` - Enables Dynamo's metrics HTTP endpoint
**Minimal example:**
**Example:**
```bash
DYN_SYSTEM_PORT=8081 \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache
```
**Recommended (with Dynamo defaults):**
```bash
DYN_SYSTEM_PORT=8081 ENABLE_LMCACHE=1 \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache
```
### Viewing LMCache Metrics
```bash
......
......@@ -9,8 +9,4 @@ python -m dynamo.frontend --http-port=8000 &
# run worker with LMCache enabled
DYN_SYSTEM_PORT=8081 \
ENABLE_LMCACHE=1 \
LMCACHE_CHUNK_SIZE=256 \
LMCACHE_LOCAL_CPU=True \
LMCACHE_MAX_LOCAL_CPU_SIZE=20 \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache
......@@ -16,10 +16,6 @@ sleep 20
# run prefill worker on GPU 1 with LMCache
DYN_VLLM_KV_EVENT_PORT=20081 \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
ENABLE_LMCACHE=1 \
LMCACHE_CHUNK_SIZE=256 \
LMCACHE_LOCAL_CPU=True \
LMCACHE_MAX_LOCAL_CPU_SIZE=20 \
CUDA_VISIBLE_DEVICES=1 \
python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \
......
......@@ -5,8 +5,8 @@ Test the correctness of Dynamo integration with LMCache by comparing MMLU benchm
## Testing Principle
Compare MMLU test results under two configurations:
- **Baseline Test**: Dynamo without LMCache (`ENABLE_LMCACHE=0`)
- **LMCache Test**: Dynamo with LMCache enabled (`ENABLE_LMCACHE=1`)
- **Baseline Test**: Dynamo without LMCache
- **LMCache Test**: Dynamo with LMCache enabled
If both configurations produce the same inference results, it verifies that LMCache functionality is correct.
......@@ -63,14 +63,12 @@ python3 summarize_scores_dynamo.py
### Baseline Architecture (deploy-baseline-dynamo.sh)
```
HTTP Request → Dynamo Ingress(8000) → Dynamo Worker → Direct Inference
Environment: ENABLE_LMCACHE=0
```
### LMCache Architecture (deploy-lmcache_enabled-dynamo.sh)
```
HTTP Request → Dynamo Ingress(8000) → Dynamo Worker → LMCache-enabled Inference
Environment: ENABLE_LMCACHE=1
LMCACHE_CHUNK_SIZE=256
Environment: LMCACHE_CHUNK_SIZE=256
LMCACHE_LOCAL_CPU=True
LMCACHE_MAX_LOCAL_CPU_SIZE=1.0
```
......
......@@ -30,8 +30,6 @@ echo "🧹 Cleaning up any existing dynamo processes..."
pkill -f "dynamo-run" || true
sleep 2
# Disable LMCache
export ENABLE_LMCACHE=0
echo "🔧 Starting dynamo disaggregated serving without LMCache..."
python -m dynamo.frontend &
......
......@@ -28,10 +28,7 @@ echo "🧹 Cleaning up any existing dynamo processes..."
pkill -f "dynamo-run" || true
sleep 2
# Disable LMCache
export ENABLE_LMCACHE=0
echo "🔧 Starting dynamo worker without LMCache..."
python -m dynamo.frontend &
python3 -m dynamo.vllm --model $MODEL_URL
\ No newline at end of file
......@@ -40,7 +40,6 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model $MODEL_URL&
sleep 20
# run prefill worker on GPU 1 with LMCache
ENABLE_LMCACHE=1 \
LMCACHE_CHUNK_SIZE=256 \
LMCACHE_LOCAL_CPU=True \
LMCACHE_MAX_LOCAL_CPU_SIZE=20 \
......
......@@ -32,5 +32,5 @@ sleep 2
echo "🔧 Starting dynamo worker with LMCache enabled..."
python -m dynamo.frontend &
ENABLE_LMCACHE=1 \
python3 -m dynamo.vllm --model $MODEL_URL
\ No newline at end of file
python3 -m dynamo.vllm --model $MODEL_URL --connector lmcache
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment