fix(sglang): expose TokenizerMetricsCollector metrics via Prometheus (#5120)

3e341fd6 · ishandhanani · GitHub · 0980b27f · 3e341fd6 · 3e341fd6
Unverified commit 3e341fd6, authored Jan 03, 2026 by ishandhanani; committed by GitHub on Jan 03, 2026.
5 changed files
--- a/components/src/dynamo/common/utils/prometheus.py
+++ b/components/src/dynamo/common/utils/prometheus.py
@@ -16,8 +16,6 @@ import re
 from functools import lru_cache
 from typing import TYPE_CHECKING, Optional, Pattern

-from prometheus_client import generate_latest
-
 from dynamo._core import Endpoint

 # Import CollectorRegistry only for type hints to avoid importing prometheus_client at module load time.
@@ -119,6 +117,11 @@ def get_prometheus_expfmt(
    Collects all metrics from the registry and returns them in Prometheus text exposition format.
    Optionally filters metrics by prefix, excludes certain prefixes, and adds a prefix.

+    IMPORTANT: prometheus_client is imported lazily here because it must be imported AFTER
+    set_prometheus_multiproc_dir() is called by SGLang's engine initialization. Importing
+    at module level causes prometheus_client to initialize in single-process mode before
+    PROMETHEUS_MULTIPROC_DIR is set, which breaks TokenizerMetricsCollector metrics.
+
    Args:
        registry: Prometheus registry to collect from.
                 Pass CollectorRegistry with MultiProcessCollector for SGLang.
@@ -138,6 +141,8 @@ def get_prometheus_expfmt(
        # Filter out python_/process_ metrics and add trtllm_ prefix
        get_prometheus_expfmt(registry, exclude_prefixes=["python_", "process_"], add_prefix="trtllm_")
    """
+    from prometheus_client import generate_latest
+
    try:
        # Generate metrics in Prometheus text format
        metrics_text = generate_latest(registry).decode("utf-8")

--- a/components/src/dynamo/sglang/publisher.py
+++ b/components/src/dynamo/sglang/publisher.py
@@ -4,14 +4,16 @@
 import asyncio
 import json
 import logging
-from typing import List, Optional, Tuple
+from typing import TYPE_CHECKING, List, Optional, Tuple

 import sglang as sgl
 import zmq
 import zmq.asyncio
-from prometheus_client import CollectorRegistry, multiprocess
 from sglang.srt.utils import get_local_ip_auto, get_zmq_socket, maybe_wrap_ipv6_address

+if TYPE_CHECKING:
+    from prometheus_client import CollectorRegistry
+
 from dynamo.common.utils.prometheus import register_engine_metrics_callback
 from dynamo.llm import (
    ForwardPassMetrics,
@@ -224,7 +226,7 @@ class DynamoSglangPublisher:

 def setup_prometheus_registry(
    engine: sgl.Engine, generate_endpoint: Endpoint
-) -> CollectorRegistry:
+) -> "CollectorRegistry":
    """Set up Prometheus registry for SGLang metrics collection.

    SGLang uses multiprocess architecture where metrics are stored in shared memory.
@@ -232,6 +234,11 @@ def setup_prometheus_registry(
    registry collects sglang:* metrics which are exposed via the metrics server endpoint
    (set DYN_SYSTEM_PORT to a positive value to enable, e.g., DYN_SYSTEM_PORT=8081).

+    IMPORTANT: prometheus_client must be imported AFTER sgl.Engine() has called
+    set_prometheus_multiproc_dir(). Importing at module level causes prometheus_client
+    to initialize in single-process mode before PROMETHEUS_MULTIPROC_DIR is set,
+    which breaks TokenizerMetricsCollector metrics (TTFT, ITL, e2e latency, etc.).
+
    Args:
        engine: The SGLang engine instance.
        generate_endpoint: The Dynamo endpoint for generation requests.
@@ -239,6 +246,8 @@ def setup_prometheus_registry(
    Returns:
        Configured CollectorRegistry with multiprocess support.
    """
+    from prometheus_client import CollectorRegistry, multiprocess
+
    registry = CollectorRegistry()
    multiprocess.MultiProcessCollector(registry)
    register_engine_metrics_callback(

--- a/components/src/dynamo/sglang/tests/test_sglang_prometheus_utils.py
+++ b/components/src/dynamo/sglang/tests/test_sglang_prometheus_utils.py
@@ -3,7 +3,7 @@

 """Unit tests for Prometheus utilities."""

-from unittest.mock import Mock
+from unittest.mock import Mock, patch

 import pytest

@@ -21,12 +21,7 @@ pytestmark = [
 class TestGetPrometheusExpfmt:
    """Test class for get_prometheus_expfmt function."""

-    @pytest.fixture
-    def sglang_registry(self):
-        """Create a mock registry with SGLang-style metrics."""
-        registry = Mock()
-
-        sample_metrics = """# HELP python_gc_objects_collected_total Objects collected during gc
+    SAMPLE_METRICS = """# HELP python_gc_objects_collected_total Objects collected during gc
 # TYPE python_gc_objects_collected_total counter
 python_gc_objects_collected_total{generation="0"} 123.0
 # HELP process_cpu_seconds_total Total user and system CPU time spent in seconds
@@ -43,22 +38,16 @@ sglang:generation_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 75
 sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0075
 """

-        def mock_generate_latest(reg):
-            return sample_metrics.encode("utf-8")
-
-        import dynamo.common.utils.prometheus
-
-        original_generate_latest = dynamo.common.utils.prometheus.generate_latest
-        dynamo.common.utils.prometheus.generate_latest = mock_generate_latest
-
-        yield registry
-
-        dynamo.common.utils.prometheus.generate_latest = original_generate_latest
-
-    def test_sglang_use_case(self, sglang_registry):
+    def test_sglang_use_case(self):
        """Test SGLang use case: filter to sglang: metrics and exclude python_/process_."""
+        registry = Mock()
+
+        with patch(
+            "prometheus_client.generate_latest",
+            return_value=self.SAMPLE_METRICS.encode("utf-8"),
+        ):
            result = get_prometheus_expfmt(
-            sglang_registry,
+                registry,
                metric_prefix_filters=["sglang:"],
                exclude_prefixes=["python_", "process_"],
            )
@@ -80,10 +69,12 @@ sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0075

    def test_error_handling(self):
        """Test error handling when registry fails."""
-        # Create a registry that raises an exception
        bad_registry = Mock()
-        bad_registry.side_effect = Exception("Registry error")

+        with patch(
+            "prometheus_client.generate_latest",
+            side_effect=Exception("Registry error"),
+        ):
            result = get_prometheus_expfmt(bad_registry)

        # Should return empty string on error

--- a/components/src/dynamo/trtllm/tests/test_trtllm_prometheus_utils.py
+++ b/components/src/dynamo/trtllm/tests/test_trtllm_prometheus_utils.py
@@ -3,7 +3,7 @@

 """Unit tests for Prometheus utilities."""

-from unittest.mock import Mock
+from unittest.mock import Mock, patch

 import pytest

@@ -21,12 +21,7 @@ pytestmark = [
 class TestGetPrometheusExpfmt:
    """Test class for get_prometheus_expfmt function."""

-    @pytest.fixture
-    def trtllm_registry(self):
-        """Create a mock registry with TensorRT-LLM-style metrics (no existing prefixes)."""
-        registry = Mock()
-
-        sample_metrics = """# HELP python_gc_objects_collected_total Objects collected during gc
+    TRTLLM_SAMPLE_METRICS = """# HELP python_gc_objects_collected_total Objects collected during gc
 # TYPE python_gc_objects_collected_total counter
 python_gc_objects_collected_total{generation="0"} 123.0
 # HELP process_cpu_seconds_total Total user and system CPU time spent in seconds
@@ -44,22 +39,16 @@ num_requests_running 3.0
 tokens_per_second 245.7
 """

-        def mock_generate_latest(reg):
-            return sample_metrics.encode("utf-8")
-
-        import dynamo.common.utils.prometheus
-
-        original_generate_latest = dynamo.common.utils.prometheus.generate_latest
-        dynamo.common.utils.prometheus.generate_latest = mock_generate_latest
-
-        yield registry
-
-        dynamo.common.utils.prometheus.generate_latest = original_generate_latest
-
-    def test_trtllm_use_case(self, trtllm_registry):
+    def test_trtllm_use_case(self):
        """Test TensorRT-LLM use case: exclude python_/process_ and add trtllm_ prefix."""
+        registry = Mock()
+
+        with patch(
+            "prometheus_client.generate_latest",
+            return_value=self.TRTLLM_SAMPLE_METRICS.encode("utf-8"),
+        ):
            result = get_prometheus_expfmt(
-            trtllm_registry,
+                registry,
                exclude_prefixes=["python_", "process_"],
                add_prefix="trtllm_",
            )
@@ -82,9 +71,15 @@ tokens_per_second 245.7
        assert "trtllm_tokens_per_second 245.7" in result
        assert result.endswith("\n")

-    def test_no_filtering_all_frameworks(self, trtllm_registry):
+    def test_no_filtering_all_frameworks(self):
        """Test that without any filters, all metrics are returned."""
-        result = get_prometheus_expfmt(trtllm_registry)
+        registry = Mock()
+
+        with patch(
+            "prometheus_client.generate_latest",
+            return_value=self.TRTLLM_SAMPLE_METRICS.encode("utf-8"),
+        ):
+            result = get_prometheus_expfmt(registry)

        # Should contain all metrics including excluded ones
        assert "python_gc_objects_collected_total" in result
@@ -93,10 +88,16 @@ tokens_per_second 245.7
        assert "num_requests_running" in result
        assert result.endswith("\n")

-    def test_empty_result_handling(self, trtllm_registry):
+    def test_empty_result_handling(self):
        """Test handling when all metrics are filtered out."""
+        registry = Mock()
+
+        with patch(
+            "prometheus_client.generate_latest",
+            return_value=self.TRTLLM_SAMPLE_METRICS.encode("utf-8"),
+        ):
            result = get_prometheus_expfmt(
-            trtllm_registry,
+                registry,
                exclude_prefixes=["python_", "process_", "request_", "num_", "tokens_"],
            )

@@ -116,15 +117,10 @@ trtllm_request_success_total{model_name="test",finished_reason="stop"} 10.0
 trtllm_time_to_first_token_seconds_count 5.0
 """

-        def mock_generate_latest(reg):
-            return sample_metrics.encode("utf-8")
-
-        import dynamo.common.utils.prometheus
-
-        original_generate_latest = dynamo.common.utils.prometheus.generate_latest
-        dynamo.common.utils.prometheus.generate_latest = mock_generate_latest
-
-        try:
+        with patch(
+            "prometheus_client.generate_latest",
+            return_value=sample_metrics.encode("utf-8"),
+        ):
            result = get_prometheus_expfmt(
                registry,
                exclude_prefixes=["python_", "process_"],
@@ -136,15 +132,15 @@ trtllm_time_to_first_token_seconds_count 5.0
        assert "trtllm_request_success_total" in result
        assert "trtllm_time_to_first_token_seconds" in result
        assert result.endswith("\n")
-        finally:
-            dynamo.common.utils.prometheus.generate_latest = original_generate_latest

    def test_error_handling(self):
        """Test error handling when registry fails."""
-        # Create a registry that raises an exception
        bad_registry = Mock()
-        bad_registry.side_effect = Exception("Registry error")

+        with patch(
+            "prometheus_client.generate_latest",
+            side_effect=Exception("Registry error"),
+        ):
            result = get_prometheus_expfmt(bad_registry)

        # Should return empty string on error

--- a/components/src/dynamo/vllm/tests/test_vllm_prometheus_utils.py
+++ b/components/src/dynamo/vllm/tests/test_vllm_prometheus_utils.py
@@ -3,7 +3,7 @@

 """Unit tests for Prometheus utilities."""

-from unittest.mock import Mock
+from unittest.mock import Mock, patch

 import pytest

@@ -21,12 +21,7 @@ pytestmark = [
 class TestGetPrometheusExpfmt:
    """Test class for get_prometheus_expfmt function."""

-    @pytest.fixture
-    def vllm_registry(self):
-        """Create a mock registry with vLLM-style metrics."""
-        registry = Mock()
-
-        sample_metrics = """# HELP python_gc_objects_collected_total Objects collected during gc
+    SAMPLE_METRICS = """# HELP python_gc_objects_collected_total Objects collected during gc
 # TYPE python_gc_objects_collected_total counter
 python_gc_objects_collected_total{generation="0"} 123.0
 # HELP process_cpu_seconds_total Total user and system CPU time spent in seconds
@@ -41,22 +36,16 @@ vllm:time_to_first_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-
 vllm:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B"} 165.0
 """

-        def mock_generate_latest(reg):
-            return sample_metrics.encode("utf-8")
-
-        import dynamo.common.utils.prometheus
-
-        original_generate_latest = dynamo.common.utils.prometheus.generate_latest
-        dynamo.common.utils.prometheus.generate_latest = mock_generate_latest
-
-        yield registry
-
-        dynamo.common.utils.prometheus.generate_latest = original_generate_latest
-
-    def test_vllm_use_case(self, vllm_registry):
+    def test_vllm_use_case(self):
        """Test vLLM use case: filter to vllm: metrics and exclude python_/process_."""
+        registry = Mock()
+
+        with patch(
+            "prometheus_client.generate_latest",
+            return_value=self.SAMPLE_METRICS.encode("utf-8"),
+        ):
            result = get_prometheus_expfmt(
-            vllm_registry,
+                registry,
                metric_prefix_filters=["vllm:"],
                exclude_prefixes=["python_", "process_"],
            )
@@ -77,10 +66,12 @@ vllm:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B"} 165

    def test_error_handling(self):
        """Test error handling when registry fails."""
-        # Create a registry that raises an exception
        bad_registry = Mock()
-        bad_registry.side_effect = Exception("Registry error")

+        with patch(
+            "prometheus_client.generate_latest",
+            side_effect=Exception("Registry error"),
+        ):
            result = get_prometheus_expfmt(bad_registry)

        # Should return empty string on error