Unverified Commit 75bf7c9b authored by Keiven C's avatar Keiven C Committed by GitHub
Browse files

fix: expose LMCache metrics via Dynamo metrics endpoint (#4461)


Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent 5dafe10b
......@@ -30,7 +30,7 @@ if TYPE_CHECKING:
def register_engine_metrics_callback(
endpoint: Endpoint,
registry: "CollectorRegistry",
metric_prefix_filter: Optional[str] = None,
metric_prefix_filters: Optional[list[str]] = None,
exclude_prefixes: Optional[list[str]] = None,
add_prefix: Optional[str] = None,
) -> None:
......@@ -43,14 +43,19 @@ def register_engine_metrics_callback(
Args:
endpoint: Dynamo endpoint object with metrics.register_prometheus_expfmt_callback()
registry: Prometheus registry to collect from (e.g., REGISTRY or CollectorRegistry)
metric_prefix_filter: Prefix to filter metrics (e.g., "vllm:" or "sglang:", None for no filtering)
metric_prefix_filters: List of prefixes to filter metrics (e.g., ["vllm:"], ["vllm:", "lmcache:"], or None for no filtering)
exclude_prefixes: List of metric name prefixes to exclude (e.g., ["python_", "process_"])
add_prefix: Prefix to add to remaining metrics (e.g., "trtllm_")
Example:
from prometheus_client import REGISTRY
register_engine_metrics_callback(
generate_endpoint, REGISTRY, metric_prefix_filter="vllm:"
generate_endpoint, REGISTRY, metric_prefix_filters=["vllm:"]
)
# Include multiple metric prefixes
register_engine_metrics_callback(
generate_endpoint, REGISTRY, metric_prefix_filters=["vllm:", "lmcache:"]
)
# With filtering and prefixing for TensorRT-LLM
......@@ -65,7 +70,7 @@ def register_engine_metrics_callback(
"""Callback to return engine Prometheus metrics in exposition format"""
return get_prometheus_expfmt(
registry,
metric_prefix_filter=metric_prefix_filter,
metric_prefix_filters=metric_prefix_filters,
exclude_prefixes=exclude_prefixes,
add_prefix=add_prefix,
)
......@@ -85,10 +90,15 @@ def _compile_exclude_pattern(exclude_prefixes: tuple[str, ...]) -> Pattern:
@lru_cache(maxsize=64)
def _compile_include_pattern(metric_prefix: str) -> Pattern:
"""Compile and cache regex for including metrics by prefix."""
escaped_prefix = re.escape(metric_prefix)
return re.compile(rf"^(# (HELP|TYPE) )?{escaped_prefix}")
def _compile_include_pattern(metric_prefixes: tuple[str, ...]) -> Pattern:
"""Compile and cache regex for including metrics by prefix.
Args take tuple not list - lru_cache requires hashable args (tuples are hashable, lists are not).
Supports multiple prefixes with OR logic (e.g., ("vllm:", "lmcache:")).
"""
escaped_prefixes = [re.escape(prefix) for prefix in metric_prefixes]
prefixes_regex = "|".join(escaped_prefixes)
return re.compile(rf"^(# (HELP|TYPE) )?({prefixes_regex})")
@lru_cache(maxsize=128)
......@@ -99,7 +109,7 @@ def _compile_help_type_pattern() -> Pattern:
def get_prometheus_expfmt(
registry,
metric_prefix_filter: Optional[str] = None,
metric_prefix_filters: Optional[list[str]] = None,
exclude_prefixes: Optional[list[str]] = None,
add_prefix: Optional[str] = None,
) -> str:
......@@ -113,8 +123,8 @@ def get_prometheus_expfmt(
registry: Prometheus registry to collect from.
Pass CollectorRegistry with MultiProcessCollector for SGLang.
Pass REGISTRY for vLLM single-process mode.
metric_prefix_filter: Optional prefix to filter displayed metrics (e.g., "vllm:").
If None, returns all metrics. (default: None)
metric_prefix_filters: Optional list of prefixes to filter displayed metrics (e.g., ["vllm:"] or ["vllm:", "lmcache:"]).
If None, returns all metrics. Pass a list even for a single prefix (e.g., ["vllm:"]); a bare string is not supported because it would be split into individual characters. (default: None)
exclude_prefixes: List of metric name prefixes to exclude (e.g., ["python_", "process_"])
add_prefix: Prefix to add to remaining metrics (e.g., "trtllm_")
......@@ -122,6 +132,9 @@ def get_prometheus_expfmt(
Formatted metrics text in Prometheus exposition format. Returns empty string on error.
Example:
# Filter to include only vllm and lmcache metrics
get_prometheus_expfmt(registry, metric_prefix_filters=["vllm:", "lmcache:"])
# Filter out python_/process_ metrics and add trtllm_ prefix
get_prometheus_expfmt(registry, exclude_prefixes=["python_", "process_"], add_prefix="trtllm_")
"""
......@@ -129,7 +142,7 @@ def get_prometheus_expfmt(
# Generate metrics in Prometheus text format
metrics_text = generate_latest(registry).decode("utf-8")
if metric_prefix_filter or exclude_prefixes or add_prefix:
if metric_prefix_filters or exclude_prefixes or add_prefix:
lines = []
# Get cached compiled patterns
......@@ -139,8 +152,9 @@ def get_prometheus_expfmt(
# Build include pattern if needed
include_pattern = None
if metric_prefix_filter:
include_pattern = _compile_include_pattern(metric_prefix_filter)
if metric_prefix_filters:
filter_tuple: tuple[str, ...] = tuple(metric_prefix_filters)
include_pattern = _compile_include_pattern(filter_tuple)
# Get cached HELP/TYPE pattern
help_type_pattern = _compile_help_type_pattern()
......@@ -165,10 +179,11 @@ def get_prometheus_expfmt(
if match:
comment_type, metric_name, rest = match.groups()
# Remove existing prefix if present
if metric_prefix_filter:
metric_name = metric_name.removeprefix(
metric_prefix_filter
)
if metric_prefix_filters:
for prefix in metric_prefix_filters:
if metric_name.startswith(prefix):
metric_name = metric_name.removeprefix(prefix)
break
# Only add prefix if it doesn't already exist
if not metric_name.startswith(add_prefix):
metric_name = add_prefix + metric_name
......@@ -182,10 +197,13 @@ def get_prometheus_expfmt(
rest_of_line = parts[1] if len(parts) > 1 else ""
# Remove existing prefix if present
if metric_prefix_filter:
metric_name_part = metric_name_part.removeprefix(
metric_prefix_filter
)
if metric_prefix_filters:
for prefix in metric_prefix_filters:
if metric_name_part.startswith(prefix):
metric_name_part = (
metric_name_part.removeprefix(prefix)
)
break
# Only add prefix if it doesn't already exist
if not metric_name_part.startswith(add_prefix):
......
......@@ -218,7 +218,7 @@ def setup_prometheus_registry(
register_engine_metrics_callback(
endpoint=generate_endpoint,
registry=registry,
metric_prefix_filter="sglang:",
metric_prefix_filters=["sglang:"],
)
return registry
......
......@@ -9,14 +9,12 @@ import tempfile
from typing import Optional
import uvloop
from prometheus_client import REGISTRY
from vllm.distributed.kv_events import ZmqEventPublisher
from vllm.usage.usage_lib import UsageContext
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus
from dynamo.common.config_dump import dump_config
from dynamo.common.utils.prometheus import register_engine_metrics_callback
from dynamo.llm import (
ModelInput,
ModelRuntimeConfig,
......@@ -193,8 +191,11 @@ def setup_kv_event_publisher(
def setup_vllm_engine(config, stat_logger=None):
# Set PROMETHEUS_MULTIPROC_DIR before setup to avoid vLLM v0.11.0 bug
# vllm/v1/metrics/prometheus.py:79 passes TemporaryDirectory object instead of .name
# Existing vLLM v0.11.0 bug: vllm/v1/metrics/prometheus.py:79 passes TemporaryDirectory object instead of
# the .name string, causing a false error message when vLLM exits. Therefore, always set
# PROMETHEUS_MULTIPROC_DIR first, and we'll do the path cleanup.
# This vLLM bug causes a false error message when vLLM exits.
prometheus_temp_dir = None
if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
prometheus_temp_dir = tempfile.TemporaryDirectory(prefix="vllm_prometheus_")
......@@ -203,7 +204,7 @@ def setup_vllm_engine(config, stat_logger=None):
f"Created PROMETHEUS_MULTIPROC_DIR at: {os.environ['PROMETHEUS_MULTIPROC_DIR']}"
)
setup_multiprocess_prometheus()
setup_multiprocess_prometheus() # call vLLM's library's function to setup multiprocess prometheus
logger.debug(
f"Prometheus multiproc dir set to: {os.environ.get('PROMETHEUS_MULTIPROC_DIR')}"
)
......@@ -374,8 +375,29 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
handler.kv_publishers = kv_publishers
if config.engine_args.disable_log_stats is False:
# vLLM v1 registers its metrics with 'vllm:' prefix
from prometheus_client import REGISTRY, multiprocess
from dynamo.common.utils.prometheus import register_engine_metrics_callback
# Option 1: Try adding MultiProcessCollector to the global REGISTRY
# This would make REGISTRY collect from both its registered metrics AND multiprocess files
if os.environ.get("PROMETHEUS_MULTIPROC_DIR"):
try:
# Add MultiProcessCollector to global REGISTRY
# This makes REGISTRY collect from .db files in addition to its own metrics
multiprocess.MultiProcessCollector(REGISTRY)
logger.info("Added MultiProcessCollector to global REGISTRY")
except ValueError as e:
# Might already be registered or directory issues
logger.warning(f"Could not add MultiProcessCollector to REGISTRY: {e}")
# Register callback with the global REGISTRY
# Now it should collect both its own metrics AND multiprocess metrics
register_engine_metrics_callback(
endpoint=generate_endpoint, registry=REGISTRY, metric_prefix_filter="vllm:"
endpoint=generate_endpoint,
registry=REGISTRY,
metric_prefix_filters=["vllm:", "lmcache:"],
)
# Register prefill model with ModelType.Prefill
......@@ -484,8 +506,29 @@ async def init(runtime: DistributedRuntime, config: Config):
handler.kv_publishers = kv_publishers
if config.engine_args.disable_log_stats is False:
# vLLM v1 registers its metrics with 'vllm:' prefix
from prometheus_client import REGISTRY, multiprocess
from dynamo.common.utils.prometheus import register_engine_metrics_callback
# Option 1: Try adding MultiProcessCollector to the global REGISTRY
# This would make REGISTRY collect from both its registered metrics AND multiprocess files
if os.environ.get("PROMETHEUS_MULTIPROC_DIR"):
try:
# Add MultiProcessCollector to global REGISTRY
# This makes REGISTRY collect from .db files in addition to its own metrics
multiprocess.MultiProcessCollector(REGISTRY)
logger.info("Added MultiProcessCollector to global REGISTRY")
except ValueError as e:
# Might already be registered or directory issues
logger.warning(f"Could not add MultiProcessCollector to REGISTRY: {e}")
# Register callback with the global REGISTRY
# Now it should collect both its own metrics AND multiprocess metrics
register_engine_metrics_callback(
endpoint=generate_endpoint, registry=REGISTRY, metric_prefix_filter="vllm:"
endpoint=generate_endpoint,
registry=REGISTRY,
metric_prefix_filters=["vllm:", "lmcache:"],
)
if not config.engine_args.data_parallel_rank: # if rank is 0 or None then register
......
......@@ -20,16 +20,43 @@ This document describes how LMCache is integrated into Dynamo's vLLM backend to
### Configuration
LMCache is enabled by setting the `ENABLE_LMCACHE` environment variable:
LMCache is enabled using the `--connector lmcache` flag:
```bash
export ENABLE_LMCACHE=1
python -m dynamo.vllm --model <model_name> --connector lmcache
```
Additional LMCache configuration can be customized via environment variables:
- `LMCACHE_CHUNK_SIZE=256` - Token chunk size for cache granularity (default: 256)
**The `--connector lmcache` flag is required** to enable LMCache in vLLM. Optionally set `ENABLE_LMCACHE=1` to use Dynamo's default LMCache configuration values, or set individual `LMCACHE_*` environment variables for custom configuration.
### Environment Variables
LMCache configuration can be customized via environment variables:
**Option 1: Use Dynamo Defaults (Recommended)**
```bash
export ENABLE_LMCACHE=1 # Sets Dynamo's recommended defaults
python -m dynamo.vllm --model <model_name> --connector lmcache
```
Dynamo sets these defaults when `ENABLE_LMCACHE=1`:
- `LMCACHE_CHUNK_SIZE=256` - Token chunk size for cache granularity
- `LMCACHE_LOCAL_CPU=True` - Enable CPU memory backend for offloading
- `LMCACHE_MAX_LOCAL_CPU_SIZE=20` - CPU memory limit in GB (user can adjust based on available RAM to a fixed value)
- `LMCACHE_MAX_LOCAL_CPU_SIZE=20` - CPU memory limit in GB
**Option 2: Set Individual Variables**
```bash
export LMCACHE_CHUNK_SIZE=256
export LMCACHE_LOCAL_CPU=True
export LMCACHE_MAX_LOCAL_CPU_SIZE=20
python -m dynamo.vllm --model <model_name> --connector lmcache
```
**Option 3: Use LMCache Defaults**
```bash
# Just use --connector lmcache without env vars
python -m dynamo.vllm --model <model_name> --connector lmcache
# LMCache will use its own defaults (chunk_size=256, local_cpu=True, max_local_cpu_size=5GB)
```
For advanced configurations, LMCache supports multiple [storage backends](https://docs.lmcache.ai/index.html):
- **CPU RAM**: Fast local memory offloading
......@@ -167,8 +194,19 @@ lmcache_config = {
- Shared context across sessions
- Long-running services with warm caches
## Metrics and Monitoring
When LMCache is enabled with `--connector lmcache` and `DYN_SYSTEM_PORT` is set, LMCache metrics are automatically exposed via Dynamo's `/metrics` endpoint alongside vLLM and Dynamo metrics.
**Requirements to access LMCache metrics:**
- `--connector lmcache` - Enables LMCache
- `DYN_SYSTEM_PORT=8081` - Enables metrics HTTP endpoint
For detailed information on LMCache metrics, including the complete list of available metrics and how to access them, see the **[LMCache Metrics section](prometheus.md#lmcache-metrics)** in the vLLM Prometheus Metrics Guide.
## References and Additional Resources
- [LMCache Documentation](https://docs.lmcache.ai/index.html) - Comprehensive guide and API reference
- [Configuration Reference](https://docs.lmcache.ai/api_reference/configurations.html) - Detailed configuration options
- [LMCache Observability Guide](https://docs.lmcache.ai/production/observability/vllm_endpoint.html) - Metrics and monitoring details
......@@ -11,15 +11,19 @@ When running vLLM through Dynamo, vLLM engine metrics are automatically passed t
**For the complete and authoritative list of all vLLM metrics**, always refer to the [official vLLM Metrics Design documentation](https://docs.vllm.ai/en/latest/design/metrics.html).
**For LMCache metrics and integration**, see the [LMCache Integration Guide](LMCache_Integration.md).
**For Dynamo runtime metrics**, see the [Dynamo Metrics Guide](../../observability/metrics.md).
**For visualization setup instructions**, see the [Prometheus and Grafana Setup Guide](../../observability/prometheus-grafana.md).
## Environment Variables
## Environment Variables and Flags
| Variable | Description | Default | Example |
|----------|-------------|---------|---------|
| `DYN_SYSTEM_PORT` | System metrics/health port | `-1` (disabled) | `8081` |
| Variable/Flag | Description | Default | Example |
|---------------|-------------|---------|---------|
| `DYN_SYSTEM_PORT` | System metrics/health port. Required to expose `/metrics` endpoint. | `-1` (disabled) | `8081` |
| `--connector` | KV connector to use. Use `lmcache` to enable LMCache metrics. | `nixl` | `--connector lmcache` |
| `ENABLE_LMCACHE` | Sets Dynamo's recommended LMCache defaults (optional). | Not set | `ENABLE_LMCACHE=1` |
## Getting Started Quickly
......@@ -103,12 +107,46 @@ The official vLLM documentation includes complete metric definitions with:
For the complete and authoritative list of all vLLM metrics, see the [official vLLM Metrics Design documentation](https://docs.vllm.ai/en/latest/design/metrics.html).
## LMCache Metrics
When LMCache is enabled with `--connector lmcache` and `DYN_SYSTEM_PORT` is set, LMCache metrics (prefixed with `lmcache:`) are automatically exposed via Dynamo's `/metrics` endpoint alongside vLLM and Dynamo metrics.
### Minimum Requirements
To access LMCache metrics, both of these are required:
1. `--connector lmcache` - Enables LMCache in vLLM
2. `DYN_SYSTEM_PORT=8081` - Enables Dynamo's metrics HTTP endpoint
**Minimal example:**
```bash
DYN_SYSTEM_PORT=8081 \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache
```
**Recommended (with Dynamo defaults):**
```bash
DYN_SYSTEM_PORT=8081 ENABLE_LMCACHE=1 \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache
```
### Viewing LMCache Metrics
```bash
# View all LMCache metrics
curl -s localhost:8081/metrics | grep "^lmcache:"
```
**For complete LMCache configuration and metric details**, see:
- [LMCache Integration Guide](LMCache_Integration.md) - Setup and configuration
- [LMCache Observability Documentation](https://docs.lmcache.ai/production/observability/vllm_endpoint.html) - Complete metrics reference
## Implementation Details
- vLLM v1 uses multiprocess metrics collection via `prometheus_client.multiprocess`
- `PROMETHEUS_MULTIPROC_DIR`: If this environment variable is not already set, Dynamo sets it to a temporary directory before calling vLLM's multiprocess Prometheus setup. Multiprocess metrics are stored in this directory as memory-mapped files. Each worker process writes its metrics to separate files there, which are aggregated when `/metrics` is scraped.
- Metrics are filtered by the `vllm:` prefix before being exposed
- The integration uses Dynamo's `register_engine_metrics_callback()` function
- Dynamo uses `MultiProcessCollector` to aggregate metrics from all worker processes
- Metrics are filtered by the `vllm:` and `lmcache:` prefixes before being exposed (`lmcache:` metrics are only present when LMCache is enabled)
- The integration uses Dynamo's `register_engine_metrics_callback()` function with the global `REGISTRY`
- Metrics appear after vLLM engine initialization completes
- vLLM v1 metrics are different from v0 - see the [official documentation](https://docs.vllm.ai/en/latest/design/metrics.html) for migration details
......
......@@ -8,6 +8,7 @@ trap 'echo Cleaning up...; kill 0' EXIT
python -m dynamo.frontend --http-port=8000 &
# run worker with LMCache enabled
DYN_SYSTEM_PORT=8081 \
ENABLE_LMCACHE=1 \
LMCACHE_CHUNK_SIZE=256 \
LMCACHE_LOCAL_CPU=True \
......
......@@ -48,6 +48,19 @@ vllm_configs = {
metric_payload_default(min_num_requests=6, backend="vllm"),
],
),
"aggregated_lmcache": VLLMConfig(
name="aggregated_lmcache",
directory=vllm_dir,
script_name="agg_lmcache.sh",
marks=[pytest.mark.gpu_1],
model="Qwen/Qwen3-0.6B",
request_payloads=[
chat_payload_default(),
completion_payload_default(),
metric_payload_default(min_num_requests=6, backend="vllm"),
metric_payload_default(min_num_requests=6, backend="lmcache"),
],
),
"agg-request-plane-tcp": VLLMConfig(
name="agg-request-plane-tcp",
directory=vllm_dir,
......
......@@ -317,6 +317,19 @@ class MetricsPayload(BasePayload):
multiline=True,
)
)
elif backend == "lmcache":
metrics_to_check.append(
MetricCheck(
# Check: Minimum count of unique lmcache:* metrics
name="lmcache:*",
pattern=lambda name: r"^lmcache:\w+",
validator=lambda value: len(set(value))
>= 1, # At least 1 lmcache metric
error_msg=lambda name, value: f"Expected at least 1 lmcache:* metric, but found only {len(set(value))}",
success_msg=lambda name, value: f"SUCCESS: Found {len(set(value))} lmcache:* metrics",
multiline=True,
)
)
elif backend == "sglang":
metrics_to_check.append(
MetricCheck(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment