Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
3e341fd6
Unverified
Commit
3e341fd6
authored
Jan 03, 2026
by
ishandhanani
Committed by
GitHub
Jan 03, 2026
Browse files
fix(sglang): expose TokenizerMetricsCollector metrics via Prometheus (#5120)
parent
0980b27f
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
104 additions
and
112 deletions
+104
-112
components/src/dynamo/common/utils/prometheus.py
components/src/dynamo/common/utils/prometheus.py
+7
-2
components/src/dynamo/sglang/publisher.py
components/src/dynamo/sglang/publisher.py
+12
-3
components/src/dynamo/sglang/tests/test_sglang_prometheus_utils.py
...s/src/dynamo/sglang/tests/test_sglang_prometheus_utils.py
+19
-28
components/src/dynamo/trtllm/tests/test_trtllm_prometheus_utils.py
...s/src/dynamo/trtllm/tests/test_trtllm_prometheus_utils.py
+47
-51
components/src/dynamo/vllm/tests/test_vllm_prometheus_utils.py
...nents/src/dynamo/vllm/tests/test_vllm_prometheus_utils.py
+19
-28
No files found.
components/src/dynamo/common/utils/prometheus.py
View file @
3e341fd6
...
...
@@ -16,8 +16,6 @@ import re
from
functools
import
lru_cache
from
typing
import
TYPE_CHECKING
,
Optional
,
Pattern
from
prometheus_client
import
generate_latest
from
dynamo._core
import
Endpoint
# Import CollectorRegistry only for type hints to avoid importing prometheus_client at module load time.
...
...
@@ -119,6 +117,11 @@ def get_prometheus_expfmt(
Collects all metrics from the registry and returns them in Prometheus text exposition format.
Optionally filters metrics by prefix, excludes certain prefixes, and adds a prefix.
IMPORTANT: prometheus_client is imported lazily here because it must be imported AFTER
set_prometheus_multiproc_dir() is called by SGLang's engine initialization. Importing
at module level causes prometheus_client to initialize in single-process mode before
PROMETHEUS_MULTIPROC_DIR is set, which breaks TokenizerMetricsCollector metrics.
Args:
registry: Prometheus registry to collect from.
Pass CollectorRegistry with MultiProcessCollector for SGLang.
...
...
@@ -138,6 +141,8 @@ def get_prometheus_expfmt(
# Filter out python_/process_ metrics and add trtllm_ prefix
get_prometheus_expfmt(registry, exclude_prefixes=["python_", "process_"], add_prefix="trtllm_")
"""
from
prometheus_client
import
generate_latest
try
:
# Generate metrics in Prometheus text format
metrics_text
=
generate_latest
(
registry
).
decode
(
"utf-8"
)
...
...
components/src/dynamo/sglang/publisher.py
View file @
3e341fd6
...
...
@@ -4,14 +4,16 @@
import
asyncio
import
json
import
logging
from
typing
import
List
,
Optional
,
Tuple
from
typing
import
TYPE_CHECKING
,
List
,
Optional
,
Tuple
import
sglang
as
sgl
import
zmq
import
zmq.asyncio
from
prometheus_client
import
CollectorRegistry
,
multiprocess
from
sglang.srt.utils
import
get_local_ip_auto
,
get_zmq_socket
,
maybe_wrap_ipv6_address
if
TYPE_CHECKING
:
from
prometheus_client
import
CollectorRegistry
from
dynamo.common.utils.prometheus
import
register_engine_metrics_callback
from
dynamo.llm
import
(
ForwardPassMetrics
,
...
...
@@ -224,7 +226,7 @@ class DynamoSglangPublisher:
def
setup_prometheus_registry
(
engine
:
sgl
.
Engine
,
generate_endpoint
:
Endpoint
)
->
CollectorRegistry
:
)
->
"
CollectorRegistry
"
:
"""Set up Prometheus registry for SGLang metrics collection.
SGLang uses multiprocess architecture where metrics are stored in shared memory.
...
...
@@ -232,6 +234,11 @@ def setup_prometheus_registry(
registry collects sglang:* metrics which are exposed via the metrics server endpoint
(set DYN_SYSTEM_PORT to a positive value to enable, e.g., DYN_SYSTEM_PORT=8081).
IMPORTANT: prometheus_client must be imported AFTER sgl.Engine() has called
set_prometheus_multiproc_dir(). Importing at module level causes prometheus_client
to initialize in single-process mode before PROMETHEUS_MULTIPROC_DIR is set,
which breaks TokenizerMetricsCollector metrics (TTFT, ITL, e2e latency, etc.).
Args:
engine: The SGLang engine instance.
generate_endpoint: The Dynamo endpoint for generation requests.
...
...
@@ -239,6 +246,8 @@ def setup_prometheus_registry(
Returns:
Configured CollectorRegistry with multiprocess support.
"""
from
prometheus_client
import
CollectorRegistry
,
multiprocess
registry
=
CollectorRegistry
()
multiprocess
.
MultiProcessCollector
(
registry
)
register_engine_metrics_callback
(
...
...
components/src/dynamo/sglang/tests/test_sglang_prometheus_utils.py
View file @
3e341fd6
...
...
@@ -3,7 +3,7 @@
"""Unit tests for Prometheus utilities."""
from
unittest.mock
import
Mock
from
unittest.mock
import
Mock
,
patch
import
pytest
...
...
@@ -21,12 +21,7 @@ pytestmark = [
class
TestGetPrometheusExpfmt
:
"""Test class for get_prometheus_expfmt function."""
@
pytest
.
fixture
def
sglang_registry
(
self
):
"""Create a mock registry with SGLang-style metrics."""
registry
=
Mock
()
sample_metrics
=
"""# HELP python_gc_objects_collected_total Objects collected during gc
SAMPLE_METRICS
=
"""# HELP python_gc_objects_collected_total Objects collected during gc
# TYPE python_gc_objects_collected_total counter
python_gc_objects_collected_total{generation="0"} 123.0
# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds
...
...
@@ -43,22 +38,16 @@ sglang:generation_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 75
sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0075
"""
def
mock_generate_latest
(
reg
):
return
sample_metrics
.
encode
(
"utf-8"
)
import
dynamo.common.utils.prometheus
original_generate_latest
=
dynamo
.
common
.
utils
.
prometheus
.
generate_latest
dynamo
.
common
.
utils
.
prometheus
.
generate_latest
=
mock_generate_latest
yield
registry
dynamo
.
common
.
utils
.
prometheus
.
generate_latest
=
original_generate_latest
def
test_sglang_use_case
(
self
,
sglang_registry
):
def
test_sglang_use_case
(
self
):
"""Test SGLang use case: filter to sglang: metrics and exclude python_/process_."""
registry
=
Mock
()
with
patch
(
"prometheus_client.generate_latest"
,
return_value
=
self
.
SAMPLE_METRICS
.
encode
(
"utf-8"
),
):
result
=
get_prometheus_expfmt
(
sglang_
registry
,
registry
,
metric_prefix_filters
=
[
"sglang:"
],
exclude_prefixes
=
[
"python_"
,
"process_"
],
)
...
...
@@ -80,10 +69,12 @@ sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0075
def
test_error_handling
(
self
):
"""Test error handling when registry fails."""
# Create a registry that raises an exception
bad_registry
=
Mock
()
bad_registry
.
side_effect
=
Exception
(
"Registry error"
)
with
patch
(
"prometheus_client.generate_latest"
,
side_effect
=
Exception
(
"Registry error"
),
):
result
=
get_prometheus_expfmt
(
bad_registry
)
# Should return empty string on error
...
...
components/src/dynamo/trtllm/tests/test_trtllm_prometheus_utils.py
View file @
3e341fd6
...
...
@@ -3,7 +3,7 @@
"""Unit tests for Prometheus utilities."""
from
unittest.mock
import
Mock
from
unittest.mock
import
Mock
,
patch
import
pytest
...
...
@@ -21,12 +21,7 @@ pytestmark = [
class
TestGetPrometheusExpfmt
:
"""Test class for get_prometheus_expfmt function."""
@
pytest
.
fixture
def
trtllm_registry
(
self
):
"""Create a mock registry with TensorRT-LLM-style metrics (no existing prefixes)."""
registry
=
Mock
()
sample_metrics
=
"""# HELP python_gc_objects_collected_total Objects collected during gc
TRTLLM_SAMPLE_METRICS
=
"""# HELP python_gc_objects_collected_total Objects collected during gc
# TYPE python_gc_objects_collected_total counter
python_gc_objects_collected_total{generation="0"} 123.0
# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds
...
...
@@ -44,22 +39,16 @@ num_requests_running 3.0
tokens_per_second 245.7
"""
def
mock_generate_latest
(
reg
):
return
sample_metrics
.
encode
(
"utf-8"
)
import
dynamo.common.utils.prometheus
original_generate_latest
=
dynamo
.
common
.
utils
.
prometheus
.
generate_latest
dynamo
.
common
.
utils
.
prometheus
.
generate_latest
=
mock_generate_latest
yield
registry
dynamo
.
common
.
utils
.
prometheus
.
generate_latest
=
original_generate_latest
def
test_trtllm_use_case
(
self
,
trtllm_registry
):
def
test_trtllm_use_case
(
self
):
"""Test TensorRT-LLM use case: exclude python_/process_ and add trtllm_ prefix."""
registry
=
Mock
()
with
patch
(
"prometheus_client.generate_latest"
,
return_value
=
self
.
TRTLLM_SAMPLE_METRICS
.
encode
(
"utf-8"
),
):
result
=
get_prometheus_expfmt
(
trtllm_
registry
,
registry
,
exclude_prefixes
=
[
"python_"
,
"process_"
],
add_prefix
=
"trtllm_"
,
)
...
...
@@ -82,9 +71,15 @@ tokens_per_second 245.7
assert
"trtllm_tokens_per_second 245.7"
in
result
assert
result
.
endswith
(
"
\n
"
)
def
test_no_filtering_all_frameworks
(
self
,
trtllm_registry
):
def
test_no_filtering_all_frameworks
(
self
):
"""Test that without any filters, all metrics are returned."""
result
=
get_prometheus_expfmt
(
trtllm_registry
)
registry
=
Mock
()
with
patch
(
"prometheus_client.generate_latest"
,
return_value
=
self
.
TRTLLM_SAMPLE_METRICS
.
encode
(
"utf-8"
),
):
result
=
get_prometheus_expfmt
(
registry
)
# Should contain all metrics including excluded ones
assert
"python_gc_objects_collected_total"
in
result
...
...
@@ -93,10 +88,16 @@ tokens_per_second 245.7
assert
"num_requests_running"
in
result
assert
result
.
endswith
(
"
\n
"
)
def
test_empty_result_handling
(
self
,
trtllm_registry
):
def
test_empty_result_handling
(
self
):
"""Test handling when all metrics are filtered out."""
registry
=
Mock
()
with
patch
(
"prometheus_client.generate_latest"
,
return_value
=
self
.
TRTLLM_SAMPLE_METRICS
.
encode
(
"utf-8"
),
):
result
=
get_prometheus_expfmt
(
trtllm_
registry
,
registry
,
exclude_prefixes
=
[
"python_"
,
"process_"
,
"request_"
,
"num_"
,
"tokens_"
],
)
...
...
@@ -116,15 +117,10 @@ trtllm_request_success_total{model_name="test",finished_reason="stop"} 10.0
trtllm_time_to_first_token_seconds_count 5.0
"""
def
mock_generate_latest
(
reg
):
return
sample_metrics
.
encode
(
"utf-8"
)
import
dynamo.common.utils.prometheus
original_generate_latest
=
dynamo
.
common
.
utils
.
prometheus
.
generate_latest
dynamo
.
common
.
utils
.
prometheus
.
generate_latest
=
mock_generate_latest
try
:
with
patch
(
"prometheus_client.generate_latest"
,
return_value
=
sample_metrics
.
encode
(
"utf-8"
),
):
result
=
get_prometheus_expfmt
(
registry
,
exclude_prefixes
=
[
"python_"
,
"process_"
],
...
...
@@ -136,15 +132,15 @@ trtllm_time_to_first_token_seconds_count 5.0
assert
"trtllm_request_success_total"
in
result
assert
"trtllm_time_to_first_token_seconds"
in
result
assert
result
.
endswith
(
"
\n
"
)
finally
:
dynamo
.
common
.
utils
.
prometheus
.
generate_latest
=
original_generate_latest
def
test_error_handling
(
self
):
"""Test error handling when registry fails."""
# Create a registry that raises an exception
bad_registry
=
Mock
()
bad_registry
.
side_effect
=
Exception
(
"Registry error"
)
with
patch
(
"prometheus_client.generate_latest"
,
side_effect
=
Exception
(
"Registry error"
),
):
result
=
get_prometheus_expfmt
(
bad_registry
)
# Should return empty string on error
...
...
components/src/dynamo/vllm/tests/test_vllm_prometheus_utils.py
View file @
3e341fd6
...
...
@@ -3,7 +3,7 @@
"""Unit tests for Prometheus utilities."""
from
unittest.mock
import
Mock
from
unittest.mock
import
Mock
,
patch
import
pytest
...
...
@@ -21,12 +21,7 @@ pytestmark = [
class
TestGetPrometheusExpfmt
:
"""Test class for get_prometheus_expfmt function."""
@
pytest
.
fixture
def
vllm_registry
(
self
):
"""Create a mock registry with vLLM-style metrics."""
registry
=
Mock
()
sample_metrics
=
"""# HELP python_gc_objects_collected_total Objects collected during gc
SAMPLE_METRICS
=
"""# HELP python_gc_objects_collected_total Objects collected during gc
# TYPE python_gc_objects_collected_total counter
python_gc_objects_collected_total{generation="0"} 123.0
# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds
...
...
@@ -41,22 +36,16 @@ vllm:time_to_first_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-
vllm:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B"} 165.0
"""
def
mock_generate_latest
(
reg
):
return
sample_metrics
.
encode
(
"utf-8"
)
import
dynamo.common.utils.prometheus
original_generate_latest
=
dynamo
.
common
.
utils
.
prometheus
.
generate_latest
dynamo
.
common
.
utils
.
prometheus
.
generate_latest
=
mock_generate_latest
yield
registry
dynamo
.
common
.
utils
.
prometheus
.
generate_latest
=
original_generate_latest
def
test_vllm_use_case
(
self
,
vllm_registry
):
def
test_vllm_use_case
(
self
):
"""Test vLLM use case: filter to vllm: metrics and exclude python_/process_."""
registry
=
Mock
()
with
patch
(
"prometheus_client.generate_latest"
,
return_value
=
self
.
SAMPLE_METRICS
.
encode
(
"utf-8"
),
):
result
=
get_prometheus_expfmt
(
vllm_
registry
,
registry
,
metric_prefix_filters
=
[
"vllm:"
],
exclude_prefixes
=
[
"python_"
,
"process_"
],
)
...
...
@@ -77,10 +66,12 @@ vllm:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B"} 165
def
test_error_handling
(
self
):
"""Test error handling when registry fails."""
# Create a registry that raises an exception
bad_registry
=
Mock
()
bad_registry
.
side_effect
=
Exception
(
"Registry error"
)
with
patch
(
"prometheus_client.generate_latest"
,
side_effect
=
Exception
(
"Registry error"
),
):
result
=
get_prometheus_expfmt
(
bad_registry
)
# Should return empty string on error
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment