fix: expose LMCache metrics via Dynamo metrics endpoint (#4461)

Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>

fix: expose LMCache metrics via Dynamo metrics endpoint (#4461)
Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
75bf7c9b · Keiven C · GitHub · 5dafe10b · 75bf7c9b · 75bf7c9b
Unverified Commit 75bf7c9b authored Nov 19, 2025 by Keiven C Committed by GitHub Nov 19, 2025
8 changed files
--- a/components/src/dynamo/common/utils/prometheus.py
+++ b/components/src/dynamo/common/utils/prometheus.py
@@ -30,7 +30,7 @@ if TYPE_CHECKING:
 def register_engine_metrics_callback(
    endpoint: Endpoint,
    registry: "CollectorRegistry",
-    metric_prefix_filter: Optional[str] = None,
+    metric_prefix_filters: Optional[list[str]] = None,
    exclude_prefixes: Optional[list[str]] = None,
    add_prefix: Optional[str] = None,
 ) -> None:
@@ -43,14 +43,19 @@ def register_engine_metrics_callback(
    Args:
        endpoint: Dynamo endpoint object with metrics.register_prometheus_expfmt_callback()
        registry: Prometheus registry to collect from (e.g., REGISTRY or CollectorRegistry)
-        metric_prefix_filter: Prefix to filter metrics (e.g., "vllm:" or "sglang:", None for no filtering)
+        metric_prefix_filters: List of prefixes to filter metrics (e.g., ["vllm:"], ["vllm:", "lmcache:"], or None for no filtering)
        exclude_prefixes: List of metric name prefixes to exclude (e.g., ["python_", "process_"])
        add_prefix: Prefix to add to remaining metrics (e.g., "trtllm_")

    Example:
        from prometheus_client import REGISTRY
        register_engine_metrics_callback(
-            generate_endpoint, REGISTRY, metric_prefix_filter="vllm:"
+            generate_endpoint, REGISTRY, metric_prefix_filters=["vllm:"]
+        )
+
+        # Include multiple metric prefixes
+        register_engine_metrics_callback(
+            generate_endpoint, REGISTRY, metric_prefix_filters=["vllm:", "lmcache:"]
        )

        # With filtering and prefixing for TensorRT-LLM
@@ -65,7 +70,7 @@ def register_engine_metrics_callback(
        """Callback to return engine Prometheus metrics in exposition format"""
        return get_prometheus_expfmt(
            registry,
-            metric_prefix_filter=metric_prefix_filter,
+            metric_prefix_filters=metric_prefix_filters,
            exclude_prefixes=exclude_prefixes,
            add_prefix=add_prefix,
        )
@@ -85,10 +90,15 @@ def _compile_exclude_pattern(exclude_prefixes: tuple[str, ...]) -> Pattern:


 @lru_cache(maxsize=64)
-def _compile_include_pattern(metric_prefix: str) -> Pattern:
-    """Compile and cache regex for including metrics by prefix."""
-    escaped_prefix = re.escape(metric_prefix)
-    return re.compile(rf"^(# (HELP|TYPE) )?{escaped_prefix}")
+def _compile_include_pattern(metric_prefixes: tuple[str, ...]) -> Pattern:
+    """Compile and cache regex for including metrics by prefix.
+
+    Args take tuple not list - lru_cache requires hashable args (tuples are hashable, lists are not).
+    Supports multiple prefixes with OR logic (e.g., ("vllm:", "lmcache:")).
+    """
+    escaped_prefixes = [re.escape(prefix) for prefix in metric_prefixes]
+    prefixes_regex = "|".join(escaped_prefixes)
+    return re.compile(rf"^(# (HELP|TYPE) )?({prefixes_regex})")


 @lru_cache(maxsize=128)
@@ -99,7 +109,7 @@ def _compile_help_type_pattern() -> Pattern:

 def get_prometheus_expfmt(
    registry,
-    metric_prefix_filter: Optional[str] = None,
+    metric_prefix_filters: Optional[list[str]] = None,
    exclude_prefixes: Optional[list[str]] = None,
    add_prefix: Optional[str] = None,
 ) -> str:
@@ -113,8 +123,8 @@ def get_prometheus_expfmt(
        registry: Prometheus registry to collect from.
                 Pass CollectorRegistry with MultiProcessCollector for SGLang.
                 Pass REGISTRY for vLLM single-process mode.
-        metric_prefix_filter: Optional prefix to filter displayed metrics (e.g., "vllm:").
-                             If None, returns all metrics. (default: None)
+        metric_prefix_filters: Optional list of prefixes to filter displayed metrics (e.g., ["vllm:"] or ["vllm:", "lmcache:"]).
+                             If None, returns all metrics. Always pass a list, even for one prefix; a bare string would be split into single-character prefixes. (default: None)
        exclude_prefixes: List of metric name prefixes to exclude (e.g., ["python_", "process_"])
        add_prefix: Prefix to add to remaining metrics (e.g., "trtllm_")

@@ -122,6 +132,9 @@ def get_prometheus_expfmt(
        Formatted metrics text in Prometheus exposition format. Returns empty string on error.

    Example:
+        # Filter to include only vllm and lmcache metrics
+        get_prometheus_expfmt(registry, metric_prefix_filters=["vllm:", "lmcache:"])
+
        # Filter out python_/process_ metrics and add trtllm_ prefix
        get_prometheus_expfmt(registry, exclude_prefixes=["python_", "process_"], add_prefix="trtllm_")
    """
@@ -129,7 +142,7 @@ def get_prometheus_expfmt(
        # Generate metrics in Prometheus text format
        metrics_text = generate_latest(registry).decode("utf-8")

-        if metric_prefix_filter or exclude_prefixes or add_prefix:
+        if metric_prefix_filters or exclude_prefixes or add_prefix:
            lines = []

            # Get cached compiled patterns
@@ -139,8 +152,9 @@ def get_prometheus_expfmt(

            # Build include pattern if needed
            include_pattern = None
-            if metric_prefix_filter:
-                include_pattern = _compile_include_pattern(metric_prefix_filter)
+            if metric_prefix_filters:
+                filter_tuple: tuple[str, ...] = tuple(metric_prefix_filters)
+                include_pattern = _compile_include_pattern(filter_tuple)

            # Get cached HELP/TYPE pattern
            help_type_pattern = _compile_help_type_pattern()
@@ -165,10 +179,11 @@ def get_prometheus_expfmt(
                        if match:
                            comment_type, metric_name, rest = match.groups()
                            # Remove existing prefix if present
-                            if metric_prefix_filter:
-                                metric_name = metric_name.removeprefix(
-                                    metric_prefix_filter
-                                )
+                            if metric_prefix_filters:
+                                for prefix in metric_prefix_filters:
+                                    if metric_name.startswith(prefix):
+                                        metric_name = metric_name.removeprefix(prefix)
+                                        break
                            # Only add prefix if it doesn't already exist
                            if not metric_name.startswith(add_prefix):
                                metric_name = add_prefix + metric_name
@@ -182,10 +197,13 @@ def get_prometheus_expfmt(
                            rest_of_line = parts[1] if len(parts) > 1 else ""

                            # Remove existing prefix if present
-                            if metric_prefix_filter:
-                                metric_name_part = metric_name_part.removeprefix(
-                                    metric_prefix_filter
-                                )
+                            if metric_prefix_filters:
+                                for prefix in metric_prefix_filters:
+                                    if metric_name_part.startswith(prefix):
+                                        metric_name_part = (
+                                            metric_name_part.removeprefix(prefix)
+                                        )
+                                        break

                            # Only add prefix if it doesn't already exist
                            if not metric_name_part.startswith(add_prefix):

--- a/components/src/dynamo/sglang/publisher.py
+++ b/components/src/dynamo/sglang/publisher.py
@@ -218,7 +218,7 @@ def setup_prometheus_registry(
    register_engine_metrics_callback(
        endpoint=generate_endpoint,
        registry=registry,
-        metric_prefix_filter="sglang:",
+        metric_prefix_filters=["sglang:"],
    )
    return registry


--- a/components/src/dynamo/vllm/main.py
+++ b/components/src/dynamo/vllm/main.py
@@ -9,14 +9,12 @@ import tempfile
 from typing import Optional

 import uvloop
-from prometheus_client import REGISTRY
 from vllm.distributed.kv_events import ZmqEventPublisher
 from vllm.usage.usage_lib import UsageContext
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus

 from dynamo.common.config_dump import dump_config
-from dynamo.common.utils.prometheus import register_engine_metrics_callback
 from dynamo.llm import (
    ModelInput,
    ModelRuntimeConfig,
@@ -193,8 +191,11 @@ def setup_kv_event_publisher(


 def setup_vllm_engine(config, stat_logger=None):
-    # Set PROMETHEUS_MULTIPROC_DIR before setup to avoid vLLM v0.11.0 bug
-    # vllm/v1/metrics/prometheus.py:79 passes TemporaryDirectory object instead of .name
+    # Workaround for a vLLM v0.11.0 bug: vllm/v1/metrics/prometheus.py:79 passes a TemporaryDirectory
+    # object instead of its .name string, which causes a false error message when vLLM exits.
+    # To avoid that, make sure PROMETHEUS_MULTIPROC_DIR is set before calling vLLM's setup,
+    # and do the path cleanup ourselves.
+
    prometheus_temp_dir = None
    if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
        prometheus_temp_dir = tempfile.TemporaryDirectory(prefix="vllm_prometheus_")
@@ -203,7 +204,7 @@ def setup_vllm_engine(config, stat_logger=None):
            f"Created PROMETHEUS_MULTIPROC_DIR at: {os.environ['PROMETHEUS_MULTIPROC_DIR']}"
        )

-    setup_multiprocess_prometheus()
+    setup_multiprocess_prometheus()  # call vLLM's library's function to setup multiprocess prometheus
    logger.debug(
        f"Prometheus multiproc dir set to: {os.environ.get('PROMETHEUS_MULTIPROC_DIR')}"
    )
@@ -374,8 +375,29 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
        handler.kv_publishers = kv_publishers

    if config.engine_args.disable_log_stats is False:
+        # vLLM v1 registers its metrics with 'vllm:' prefix
+        from prometheus_client import REGISTRY, multiprocess
+
+        from dynamo.common.utils.prometheus import register_engine_metrics_callback
+
+        # Add a MultiProcessCollector to the global REGISTRY so that REGISTRY collects from
+        # both its directly registered metrics AND the multiprocess files
+        if os.environ.get("PROMETHEUS_MULTIPROC_DIR"):
+            try:
+                # Add MultiProcessCollector to global REGISTRY
+                # This makes REGISTRY collect from .db files in addition to its own metrics
+                multiprocess.MultiProcessCollector(REGISTRY)
+                logger.info("Added MultiProcessCollector to global REGISTRY")
+            except ValueError as e:
+                # Might already be registered or directory issues
+                logger.warning(f"Could not add MultiProcessCollector to REGISTRY: {e}")
+
+        # Register callback with the global REGISTRY
+        # Now it should collect both its own metrics AND multiprocess metrics
        register_engine_metrics_callback(
-            endpoint=generate_endpoint, registry=REGISTRY, metric_prefix_filter="vllm:"
+            endpoint=generate_endpoint,
+            registry=REGISTRY,
+            metric_prefix_filters=["vllm:", "lmcache:"],
        )

    # Register prefill model with ModelType.Prefill
@@ -484,8 +506,29 @@ async def init(runtime: DistributedRuntime, config: Config):
        handler.kv_publishers = kv_publishers

    if config.engine_args.disable_log_stats is False:
+        # vLLM v1 registers its metrics with 'vllm:' prefix
+        from prometheus_client import REGISTRY, multiprocess
+
+        from dynamo.common.utils.prometheus import register_engine_metrics_callback
+
+        # Add a MultiProcessCollector to the global REGISTRY so that REGISTRY collects from
+        # both its directly registered metrics AND the multiprocess files
+        if os.environ.get("PROMETHEUS_MULTIPROC_DIR"):
+            try:
+                # Add MultiProcessCollector to global REGISTRY
+                # This makes REGISTRY collect from .db files in addition to its own metrics
+                multiprocess.MultiProcessCollector(REGISTRY)
+                logger.info("Added MultiProcessCollector to global REGISTRY")
+            except ValueError as e:
+                # Might already be registered or directory issues
+                logger.warning(f"Could not add MultiProcessCollector to REGISTRY: {e}")
+
+        # Register callback with the global REGISTRY
+        # Now it should collect both its own metrics AND multiprocess metrics
        register_engine_metrics_callback(
-            endpoint=generate_endpoint, registry=REGISTRY, metric_prefix_filter="vllm:"
+            endpoint=generate_endpoint,
+            registry=REGISTRY,
+            metric_prefix_filters=["vllm:", "lmcache:"],
        )

    if not config.engine_args.data_parallel_rank:  # if rank is 0 or None then register

--- a/docs/backends/vllm/LMCache_Integration.md
+++ b/docs/backends/vllm/LMCache_Integration.md
@@ -20,16 +20,43 @@ This document describes how LMCache is integrated into Dynamo's vLLM backend to

 ### Configuration

-LMCache is enabled by setting the `ENABLE_LMCACHE` environment variable:
+LMCache is enabled using the `--connector lmcache` flag:

 ```bash
-export ENABLE_LMCACHE=1
+python -m dynamo.vllm --model <model_name> --connector lmcache
 ```

-Additional LMCache configuration can be customized via environment variables:
- `LMCACHE_CHUNK_SIZE=256` - Token chunk size for cache granularity (default: 256)
+**The `--connector lmcache` flag is required** to enable LMCache in vLLM. Optionally set `ENABLE_LMCACHE=1` to use Dynamo's default LMCache configuration values, or set individual `LMCACHE_*` environment variables for custom configuration.
+
+### Environment Variables
+
+LMCache configuration can be customized via environment variables:
+
+**Option 1: Use Dynamo Defaults (Recommended)**
+```bash
+export ENABLE_LMCACHE=1  # Sets Dynamo's recommended defaults
+python -m dynamo.vllm --model <model_name> --connector lmcache
+```
+
+Dynamo sets these defaults when `ENABLE_LMCACHE=1`:
+- `LMCACHE_CHUNK_SIZE=256` - Token chunk size for cache granularity
 - `LMCACHE_LOCAL_CPU=True` - Enable CPU memory backend for offloading
- `LMCACHE_MAX_LOCAL_CPU_SIZE=20` - CPU memory limit in GB (user can adjust based on available RAM to a fixed value)
+- `LMCACHE_MAX_LOCAL_CPU_SIZE=20` - CPU memory limit in GB
+
+**Option 2: Set Individual Variables**
+```bash
+export LMCACHE_CHUNK_SIZE=256
+export LMCACHE_LOCAL_CPU=True
+export LMCACHE_MAX_LOCAL_CPU_SIZE=20
+python -m dynamo.vllm --model <model_name> --connector lmcache
+```
+
+**Option 3: Use LMCache Defaults**
+```bash
+# Just use --connector lmcache without env vars
+python -m dynamo.vllm --model <model_name> --connector lmcache
+# LMCache will use its own defaults (chunk_size=256, local_cpu=True, max_local_cpu_size=5GB)
+```

 For advanced configurations, LMCache supports multiple [storage backends](https://docs.lmcache.ai/index.html):
 - **CPU RAM**: Fast local memory offloading
@@ -167,8 +194,19 @@ lmcache_config = {
   - Shared context across sessions
   - Long-running services with warm caches

+## Metrics and Monitoring
+
+When LMCache is enabled with `--connector lmcache` and `DYN_SYSTEM_PORT` is set, LMCache metrics are automatically exposed via Dynamo's `/metrics` endpoint alongside vLLM and Dynamo metrics.
+
+**Requirements to access LMCache metrics:**
+- `--connector lmcache` - Enables LMCache
+- `DYN_SYSTEM_PORT=8081` - Enables metrics HTTP endpoint
+
+For detailed information on LMCache metrics, including the complete list of available metrics and how to access them, see the **[LMCache Metrics section](prometheus.md#lmcache-metrics)** in the vLLM Prometheus Metrics Guide.
+
 ## References and Additional Resources

 - [LMCache Documentation](https://docs.lmcache.ai/index.html) - Comprehensive guide and API reference
 - [Configuration Reference](https://docs.lmcache.ai/api_reference/configurations.html) - Detailed configuration options
+- [LMCache Observability Guide](https://docs.lmcache.ai/production/observability/vllm_endpoint.html) - Metrics and monitoring details

--- a/docs/backends/vllm/prometheus.md
+++ b/docs/backends/vllm/prometheus.md
@@ -11,15 +11,19 @@ When running vLLM through Dynamo, vLLM engine metrics are automatically passed t

 **For the complete and authoritative list of all vLLM metrics**, always refer to the [official vLLM Metrics Design documentation](https://docs.vllm.ai/en/latest/design/metrics.html).

+**For LMCache metrics and integration**, see the [LMCache Integration Guide](LMCache_Integration.md).
+
 **For Dynamo runtime metrics**, see the [Dynamo Metrics Guide](../../observability/metrics.md).

 **For visualization setup instructions**, see the [Prometheus and Grafana Setup Guide](../../observability/prometheus-grafana.md).

-## Environment Variables
+## Environment Variables and Flags

-| Variable | Description | Default | Example |
-|----------|-------------|---------|---------|
-| `DYN_SYSTEM_PORT` | System metrics/health port | `-1` (disabled) | `8081` |
+| Variable/Flag | Description | Default | Example |
+|---------------|-------------|---------|---------|
+| `DYN_SYSTEM_PORT` | System metrics/health port. Required to expose `/metrics` endpoint. | `-1` (disabled) | `8081` |
+| `--connector` | KV connector to use. Use `lmcache` to enable LMCache metrics. | `nixl` | `--connector lmcache` |
+| `ENABLE_LMCACHE` | Sets Dynamo's recommended LMCache defaults (optional). | Not set | `ENABLE_LMCACHE=1` |

 ## Getting Started Quickly

@@ -103,12 +107,46 @@ The official vLLM documentation includes complete metric definitions with:

 For the complete and authoritative list of all vLLM metrics, see the [official vLLM Metrics Design documentation](https://docs.vllm.ai/en/latest/design/metrics.html).

+## LMCache Metrics
+
+When LMCache is enabled with `--connector lmcache` and `DYN_SYSTEM_PORT` is set, LMCache metrics (prefixed with `lmcache:`) are automatically exposed via Dynamo's `/metrics` endpoint alongside vLLM and Dynamo metrics.
+
+### Minimum Requirements
+
+To access LMCache metrics, both of these are required:
+1. `--connector lmcache` - Enables LMCache in vLLM
+2. `DYN_SYSTEM_PORT=8081` - Enables Dynamo's metrics HTTP endpoint
+
+**Minimal example:**
+```bash
+DYN_SYSTEM_PORT=8081 \
+python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache
+```
+
+**Recommended (with Dynamo defaults):**
+```bash
+DYN_SYSTEM_PORT=8081 ENABLE_LMCACHE=1 \
+python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache
+```
+
+### Viewing LMCache Metrics
+
+```bash
+# View all LMCache metrics
+curl -s localhost:8081/metrics | grep "^lmcache:"
+```
+
+**For complete LMCache configuration and metric details**, see:
+- [LMCache Integration Guide](LMCache_Integration.md) - Setup and configuration
+- [LMCache Observability Documentation](https://docs.lmcache.ai/production/observability/vllm_endpoint.html) - Complete metrics reference
+
 ## Implementation Details

 - vLLM v1 uses multiprocess metrics collection via `prometheus_client.multiprocess`
 - `PROMETHEUS_MULTIPROC_DIR`: vLLM sets this environment variable to a temporary directory where multiprocess metrics are stored as memory-mapped files. Each worker process writes its metrics to separate files in this directory, which are aggregated when `/metrics` is scraped.
- Metrics are filtered by the `vllm:` prefix before being exposed
- The integration uses Dynamo's `register_engine_metrics_callback()` function
+- Dynamo uses `MultiProcessCollector` to aggregate metrics from all worker processes
+- Metrics are filtered by the `vllm:` and `lmcache:` prefixes before being exposed (`lmcache:` metrics are only present when LMCache is enabled)
+- The integration uses Dynamo's `register_engine_metrics_callback()` function with the global `REGISTRY`
 - Metrics appear after vLLM engine initialization completes
 - vLLM v1 metrics are different from v0 - see the [official documentation](https://docs.vllm.ai/en/latest/design/metrics.html) for migration details


--- a/examples/backends/vllm/launch/agg_lmcache.sh
+++ b/examples/backends/vllm/launch/agg_lmcache.sh
@@ -8,6 +8,7 @@ trap 'echo Cleaning up...; kill 0' EXIT
 python -m dynamo.frontend --http-port=8000 &

 # run worker with LMCache enabled
+DYN_SYSTEM_PORT=8081 \
 ENABLE_LMCACHE=1 \
 LMCACHE_CHUNK_SIZE=256 \
 LMCACHE_LOCAL_CPU=True \

--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -48,6 +48,19 @@ vllm_configs = {
            metric_payload_default(min_num_requests=6, backend="vllm"),
        ],
    ),
+    "aggregated_lmcache": VLLMConfig(
+        name="aggregated_lmcache",
+        directory=vllm_dir,
+        script_name="agg_lmcache.sh",
+        marks=[pytest.mark.gpu_1],
+        model="Qwen/Qwen3-0.6B",
+        request_payloads=[
+            chat_payload_default(),
+            completion_payload_default(),
+            metric_payload_default(min_num_requests=6, backend="vllm"),
+            metric_payload_default(min_num_requests=6, backend="lmcache"),
+        ],
+    ),
    "agg-request-plane-tcp": VLLMConfig(
        name="agg-request-plane-tcp",
        directory=vllm_dir,

--- a/tests/utils/payloads.py
+++ b/tests/utils/payloads.py
@@ -317,6 +317,19 @@ class MetricsPayload(BasePayload):
                    multiline=True,
                )
            )
+        elif backend == "lmcache":
+            metrics_to_check.append(
+                MetricCheck(
+                    # Check: Minimum count of unique lmcache:* metrics
+                    name="lmcache:*",
+                    pattern=lambda name: r"^lmcache:\w+",
+                    validator=lambda value: len(set(value))
+                    >= 1,  # At least 1 lmcache metric
+                    error_msg=lambda name, value: f"Expected at least 1 lmcache:* metric, but found only {len(set(value))}",
+                    success_msg=lambda name, value: f"SUCCESS: Found {len(set(value))} lmcache:* metrics",
+                    multiline=True,
+                )
+            )
        elif backend == "sglang":
            metrics_to_check.append(
                MetricCheck(