Unverified commit 6d69e8c7, authored by Ziqi Fan, committed by GitHub
Browse files

chore: unify LMCache UX w/ and w/o Dynamo (#4509)


Signed-off-by: Ziqi Fan <ziqif@nvidia.com>
parent b1286e7f
...@@ -26,9 +26,6 @@ logger = logging.getLogger(__name__) ...@@ -26,9 +26,6 @@ logger = logging.getLogger(__name__)
DEFAULT_MODEL = "Qwen/Qwen3-0.6B" DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
VALID_CONNECTORS = {"nixl", "lmcache", "kvbm", "null", "none"} VALID_CONNECTORS = {"nixl", "lmcache", "kvbm", "null", "none"}
# Global LMCache configuration - initialize once on module import
ENABLE_LMCACHE = os.getenv("ENABLE_LMCACHE", "0").lower() in ("1", "true", "yes")
class Config: class Config:
"""Command line parameters or defaults""" """Command line parameters or defaults"""
......
...@@ -33,7 +33,7 @@ from dynamo.vllm.multimodal_handlers import ( ...@@ -33,7 +33,7 @@ from dynamo.vllm.multimodal_handlers import (
ProcessorHandler, ProcessorHandler,
) )
from .args import ENABLE_LMCACHE, Config, overwrite_args, parse_args from .args import Config, overwrite_args, parse_args
from .handlers import DecodeWorkerHandler, PrefillWorkerHandler from .handlers import DecodeWorkerHandler, PrefillWorkerHandler
from .health_check import VllmHealthCheckPayload, VllmPrefillHealthCheckPayload from .health_check import VllmHealthCheckPayload, VllmPrefillHealthCheckPayload
from .publisher import StatLoggerFactory from .publisher import StatLoggerFactory
...@@ -42,22 +42,6 @@ configure_dynamo_logging() ...@@ -42,22 +42,6 @@ configure_dynamo_logging()
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def setup_lmcache_environment():
"""Setup LMCache environment variables for KV cache offloading"""
# LMCache configuration for matching logic
lmcache_config = {
"LMCACHE_CHUNK_SIZE": "256", # Token chunk size
"LMCACHE_LOCAL_CPU": "True", # Enable CPU memory backend
"LMCACHE_MAX_LOCAL_CPU_SIZE": "20", # CPU memory limit in GB
}
# Set environment variables
for key, value in lmcache_config.items():
if key not in os.environ: # Only set if not already configured
os.environ[key] = value
logger.info(f"Set LMCache environment variable: {key}={value}")
async def graceful_shutdown(runtime): async def graceful_shutdown(runtime):
""" """
Shutdown dynamo distributed runtime. Shutdown dynamo distributed runtime.
...@@ -214,13 +198,6 @@ def setup_vllm_engine(config, stat_logger=None): ...@@ -214,13 +198,6 @@ def setup_vllm_engine(config, stat_logger=None):
engine_args = config.engine_args engine_args = config.engine_args
# KV transfer config is now handled by args.py based on ENABLE_LMCACHE env var
if ENABLE_LMCACHE:
setup_lmcache_environment()
logger.info("LMCache enabled for VllmWorker")
else:
logger.debug("LMCache is disabled")
# Load default sampling params from `generation_config.json` # Load default sampling params from `generation_config.json`
default_sampling_params = ( default_sampling_params = (
engine_args.create_model_config().get_diff_sampling_param() engine_args.create_model_config().get_diff_sampling_param()
...@@ -258,12 +235,8 @@ def setup_vllm_engine(config, stat_logger=None): ...@@ -258,12 +235,8 @@ def setup_vllm_engine(config, stat_logger=None):
disable_log_requests=engine_args.disable_log_requests, disable_log_requests=engine_args.disable_log_requests,
disable_log_stats=engine_args.disable_log_stats, disable_log_stats=engine_args.disable_log_stats,
) )
if ENABLE_LMCACHE:
logger.info( logger.info(f"VllmWorker for {config.served_model_name} has been initialized")
f"VllmWorker for {config.served_model_name} has been initialized with LMCache"
)
else:
logger.info(f"VllmWorker for {config.served_model_name} has been initialized")
return engine_client, vllm_config, default_sampling_params, prometheus_temp_dir return engine_client, vllm_config, default_sampling_params, prometheus_temp_dir
......
...@@ -26,37 +26,9 @@ LMCache is enabled using the `--connector lmcache` flag: ...@@ -26,37 +26,9 @@ LMCache is enabled using the `--connector lmcache` flag:
python -m dynamo.vllm --model <model_name> --connector lmcache python -m dynamo.vllm --model <model_name> --connector lmcache
``` ```
**The `--connector lmcache` flag is required** to enable LMCache in vLLM. Optionally set `ENABLE_LMCACHE=1` to use Dynamo's default LMCache configuration values, or set individual `LMCACHE_*` environment variables for custom configuration. ### Customization
### Environment Variables LMCache configuration can be customized via environment variables listed [here](https://docs.lmcache.ai/api_reference/configurations.html).
LMCache configuration can be customized via environment variables:
**Option 1: Use Dynamo Defaults (Recommended)**
```bash
export ENABLE_LMCACHE=1 # Sets Dynamo's recommended defaults
python -m dynamo.vllm --model <model_name> --connector lmcache
```
Dynamo sets these defaults when `ENABLE_LMCACHE=1`:
- `LMCACHE_CHUNK_SIZE=256` - Token chunk size for cache granularity
- `LMCACHE_LOCAL_CPU=True` - Enable CPU memory backend for offloading
- `LMCACHE_MAX_LOCAL_CPU_SIZE=20` - CPU memory limit in GB
**Option 2: Set Individual Variables**
```bash
export LMCACHE_CHUNK_SIZE=256
export LMCACHE_LOCAL_CPU=True
export LMCACHE_MAX_LOCAL_CPU_SIZE=20
python -m dynamo.vllm --model <model_name> --connector lmcache
```
**Option 3: Use LMCache Defaults**
```bash
# Just use --connector lmcache without env vars
python -m dynamo.vllm --model <model_name> --connector lmcache
# LMCache will use its own defaults (chunk_size=256, local_cpu=True, max_local_cpu_size=5GB)
```
For advanced configurations, LMCache supports multiple [storage backends](https://docs.lmcache.ai/index.html): For advanced configurations, LMCache supports multiple [storage backends](https://docs.lmcache.ai/index.html):
- **CPU RAM**: Fast local memory offloading - **CPU RAM**: Fast local memory offloading
...@@ -87,10 +59,6 @@ In aggregated mode, the system uses: ...@@ -87,10 +59,6 @@ In aggregated mode, the system uses:
Disaggregated serving separates prefill and decode operations into dedicated workers. This provides better resource utilization and scalability for production deployments. Disaggregated serving separates prefill and decode operations into dedicated workers. This provides better resource utilization and scalability for production deployments.
### Configuration
The same `ENABLE_LMCACHE=1` environment variable enables LMCache, but the system automatically configures different connector setups for prefill and decode workers.
### Deployment ### Deployment
Use the provided disaggregated launch script (the script requires at least 2 GPUs): Use the provided disaggregated launch script (the script requires at least 2 GPUs):
...@@ -127,7 +95,7 @@ The system automatically configures KV transfer based on the deployment mode and ...@@ -127,7 +95,7 @@ The system automatically configures KV transfer based on the deployment mode and
#### Prefill Worker (Disaggregated Mode) #### Prefill Worker (Disaggregated Mode)
```python ```python
kv_transfer_config = KVTransferConfig( kv_transfer_config = KVTransferConfig(
kv_connector="MultiConnector", kv_connector="PdConnector",
kv_role="kv_both", kv_role="kv_both",
kv_connector_extra_config={ kv_connector_extra_config={
"connectors": [ "connectors": [
...@@ -154,22 +122,9 @@ kv_transfer_config = KVTransferConfig( ...@@ -154,22 +122,9 @@ kv_transfer_config = KVTransferConfig(
) )
``` ```
### Environment Setup
The system automatically configures LMCache environment variables when enabled:
```python
lmcache_config = {
"LMCACHE_CHUNK_SIZE": "256",
"LMCACHE_LOCAL_CPU": "True",
"LMCACHE_MAX_LOCAL_CPU_SIZE": "20"
}
```
### Integration Points ### Integration Points
1. **Argument Parsing** (`args.py`): 1. **Argument Parsing** (`args.py`):
- Detects `ENABLE_LMCACHE` environment variable
- Configures appropriate KV transfer settings - Configures appropriate KV transfer settings
- Sets up connector configurations based on worker type - Sets up connector configurations based on worker type
......
...@@ -23,7 +23,6 @@ When running vLLM through Dynamo, vLLM engine metrics are automatically passed t ...@@ -23,7 +23,6 @@ When running vLLM through Dynamo, vLLM engine metrics are automatically passed t
|---------------|-------------|---------|---------| |---------------|-------------|---------|---------|
| `DYN_SYSTEM_PORT` | System metrics/health port. Required to expose `/metrics` endpoint. | `-1` (disabled) | `8081` | | `DYN_SYSTEM_PORT` | System metrics/health port. Required to expose `/metrics` endpoint. | `-1` (disabled) | `8081` |
| `--connector` | KV connector to use. Use `lmcache` to enable LMCache metrics. | `nixl` | `--connector lmcache` | | `--connector` | KV connector to use. Use `lmcache` to enable LMCache metrics. | `nixl` | `--connector lmcache` |
| `ENABLE_LMCACHE` | Sets Dynamo's recommended LMCache defaults (optional). | Not set | `ENABLE_LMCACHE=1` |
## Getting Started Quickly ## Getting Started Quickly
...@@ -117,18 +116,12 @@ To access LMCache metrics, both of these are required: ...@@ -117,18 +116,12 @@ To access LMCache metrics, both of these are required:
1. `--connector lmcache` - Enables LMCache in vLLM 1. `--connector lmcache` - Enables LMCache in vLLM
2. `DYN_SYSTEM_PORT=8081` - Enables Dynamo's metrics HTTP endpoint 2. `DYN_SYSTEM_PORT=8081` - Enables Dynamo's metrics HTTP endpoint
**Minimal example:** **Example:**
```bash ```bash
DYN_SYSTEM_PORT=8081 \ DYN_SYSTEM_PORT=8081 \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache
``` ```
**Recommended (with Dynamo defaults):**
```bash
DYN_SYSTEM_PORT=8081 ENABLE_LMCACHE=1 \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache
```
### Viewing LMCache Metrics ### Viewing LMCache Metrics
```bash ```bash
......
...@@ -9,8 +9,4 @@ python -m dynamo.frontend --http-port=8000 & ...@@ -9,8 +9,4 @@ python -m dynamo.frontend --http-port=8000 &
# run worker with LMCache enabled # run worker with LMCache enabled
DYN_SYSTEM_PORT=8081 \ DYN_SYSTEM_PORT=8081 \
ENABLE_LMCACHE=1 \
LMCACHE_CHUNK_SIZE=256 \
LMCACHE_LOCAL_CPU=True \
LMCACHE_MAX_LOCAL_CPU_SIZE=20 \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache
...@@ -16,10 +16,6 @@ sleep 20 ...@@ -16,10 +16,6 @@ sleep 20
# run prefill worker on GPU 1 with LMCache # run prefill worker on GPU 1 with LMCache
DYN_VLLM_KV_EVENT_PORT=20081 \ DYN_VLLM_KV_EVENT_PORT=20081 \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
ENABLE_LMCACHE=1 \
LMCACHE_CHUNK_SIZE=256 \
LMCACHE_LOCAL_CPU=True \
LMCACHE_MAX_LOCAL_CPU_SIZE=20 \
CUDA_VISIBLE_DEVICES=1 \ CUDA_VISIBLE_DEVICES=1 \
python3 -m dynamo.vllm \ python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model Qwen/Qwen3-0.6B \
......
...@@ -5,8 +5,8 @@ Test the correctness of Dynamo integration with LMCache by comparing MMLU benchm ...@@ -5,8 +5,8 @@ Test the correctness of Dynamo integration with LMCache by comparing MMLU benchm
## Testing Principle ## Testing Principle
Compare MMLU test results under two configurations: Compare MMLU test results under two configurations:
- **Baseline Test**: Dynamo without LMCache (`ENABLE_LMCACHE=0`) - **Baseline Test**: Dynamo without LMCache
- **LMCache Test**: Dynamo with LMCache enabled (`ENABLE_LMCACHE=1`) - **LMCache Test**: Dynamo with LMCache enabled
If both configurations produce the same inference results, it verifies that LMCache functionality is correct. If both configurations produce the same inference results, it verifies that LMCache functionality is correct.
...@@ -63,14 +63,12 @@ python3 summarize_scores_dynamo.py ...@@ -63,14 +63,12 @@ python3 summarize_scores_dynamo.py
### Baseline Architecture (deploy-baseline-dynamo.sh) ### Baseline Architecture (deploy-baseline-dynamo.sh)
``` ```
HTTP Request → Dynamo Ingress(8000) → Dynamo Worker → Direct Inference HTTP Request → Dynamo Ingress(8000) → Dynamo Worker → Direct Inference
Environment: ENABLE_LMCACHE=0
``` ```
### LMCache Architecture (deploy-lmcache_enabled-dynamo.sh) ### LMCache Architecture (deploy-lmcache_enabled-dynamo.sh)
``` ```
HTTP Request → Dynamo Ingress(8000) → Dynamo Worker → LMCache-enabled Inference HTTP Request → Dynamo Ingress(8000) → Dynamo Worker → LMCache-enabled Inference
Environment: ENABLE_LMCACHE=1 Environment: LMCACHE_CHUNK_SIZE=256
LMCACHE_CHUNK_SIZE=256
LMCACHE_LOCAL_CPU=True LMCACHE_LOCAL_CPU=True
LMCACHE_MAX_LOCAL_CPU_SIZE=1.0 LMCACHE_MAX_LOCAL_CPU_SIZE=1.0
``` ```
......
...@@ -30,8 +30,6 @@ echo "🧹 Cleaning up any existing dynamo processes..." ...@@ -30,8 +30,6 @@ echo "🧹 Cleaning up any existing dynamo processes..."
pkill -f "dynamo-run" || true pkill -f "dynamo-run" || true
sleep 2 sleep 2
# Disable LMCache
export ENABLE_LMCACHE=0
echo "🔧 Starting dynamo disaggregated serving without LMCache..." echo "🔧 Starting dynamo disaggregated serving without LMCache..."
python -m dynamo.frontend & python -m dynamo.frontend &
......
...@@ -28,10 +28,7 @@ echo "🧹 Cleaning up any existing dynamo processes..." ...@@ -28,10 +28,7 @@ echo "🧹 Cleaning up any existing dynamo processes..."
pkill -f "dynamo-run" || true pkill -f "dynamo-run" || true
sleep 2 sleep 2
# Disable LMCache
export ENABLE_LMCACHE=0
echo "🔧 Starting dynamo worker without LMCache..." echo "🔧 Starting dynamo worker without LMCache..."
python -m dynamo.frontend & python -m dynamo.frontend &
python3 -m dynamo.vllm --model $MODEL_URL python3 -m dynamo.vllm --model $MODEL_URL
\ No newline at end of file
...@@ -40,7 +40,6 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model $MODEL_URL& ...@@ -40,7 +40,6 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model $MODEL_URL&
sleep 20 sleep 20
# run prefill worker on GPU 1 with LMCache # run prefill worker on GPU 1 with LMCache
ENABLE_LMCACHE=1 \
LMCACHE_CHUNK_SIZE=256 \ LMCACHE_CHUNK_SIZE=256 \
LMCACHE_LOCAL_CPU=True \ LMCACHE_LOCAL_CPU=True \
LMCACHE_MAX_LOCAL_CPU_SIZE=20 \ LMCACHE_MAX_LOCAL_CPU_SIZE=20 \
......
...@@ -32,5 +32,5 @@ sleep 2 ...@@ -32,5 +32,5 @@ sleep 2
echo "🔧 Starting dynamo worker with LMCache enabled..." echo "🔧 Starting dynamo worker with LMCache enabled..."
python -m dynamo.frontend & python -m dynamo.frontend &
ENABLE_LMCACHE=1 \
python3 -m dynamo.vllm --model $MODEL_URL python3 -m dynamo.vllm --model $MODEL_URL --connector lmcache
\ No newline at end of file \ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment