Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
3e341fd6
"vscode:/vscode.git/clone" did not exist on "2ae9c29067cfebb7d30dc2a967de85c140cd9f47"
Unverified
Commit
3e341fd6
authored
Jan 03, 2026
by
ishandhanani
Committed by
GitHub
Jan 03, 2026
Browse files
fix(sglang): expose TokenizerMetricsCollector metrics via Prometheus (#5120)
parent
0980b27f
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
104 additions
and
112 deletions
+104
-112
components/src/dynamo/common/utils/prometheus.py
components/src/dynamo/common/utils/prometheus.py
+7
-2
components/src/dynamo/sglang/publisher.py
components/src/dynamo/sglang/publisher.py
+12
-3
components/src/dynamo/sglang/tests/test_sglang_prometheus_utils.py
...s/src/dynamo/sglang/tests/test_sglang_prometheus_utils.py
+19
-28
components/src/dynamo/trtllm/tests/test_trtllm_prometheus_utils.py
...s/src/dynamo/trtllm/tests/test_trtllm_prometheus_utils.py
+47
-51
components/src/dynamo/vllm/tests/test_vllm_prometheus_utils.py
...nents/src/dynamo/vllm/tests/test_vllm_prometheus_utils.py
+19
-28
No files found.
components/src/dynamo/common/utils/prometheus.py
View file @
3e341fd6
...
...
@@ -16,8 +16,6 @@ import re
from
functools
import
lru_cache
from
typing
import
TYPE_CHECKING
,
Optional
,
Pattern
from
prometheus_client
import
generate_latest
from
dynamo._core
import
Endpoint
# Import CollectorRegistry only for type hints to avoid importing prometheus_client at module load time.
...
...
@@ -119,6 +117,11 @@ def get_prometheus_expfmt(
Collects all metrics from the registry and returns them in Prometheus text exposition format.
Optionally filters metrics by prefix, excludes certain prefixes, and adds a prefix.
IMPORTANT: prometheus_client is imported lazily here because it must be imported AFTER
set_prometheus_multiproc_dir() is called by SGLang's engine initialization. Importing
at module level causes prometheus_client to initialize in single-process mode before
PROMETHEUS_MULTIPROC_DIR is set, which breaks TokenizerMetricsCollector metrics.
Args:
registry: Prometheus registry to collect from.
Pass CollectorRegistry with MultiProcessCollector for SGLang.
...
...
@@ -138,6 +141,8 @@ def get_prometheus_expfmt(
# Filter out python_/process_ metrics and add trtllm_ prefix
get_prometheus_expfmt(registry, exclude_prefixes=["python_", "process_"], add_prefix="trtllm_")
"""
from
prometheus_client
import
generate_latest
try
:
# Generate metrics in Prometheus text format
metrics_text
=
generate_latest
(
registry
).
decode
(
"utf-8"
)
...
...
components/src/dynamo/sglang/publisher.py
View file @
3e341fd6
...
...
@@ -4,14 +4,16 @@
import
asyncio
import
json
import
logging
from
typing
import
List
,
Optional
,
Tuple
from
typing
import
TYPE_CHECKING
,
List
,
Optional
,
Tuple
import
sglang
as
sgl
import
zmq
import
zmq.asyncio
from
prometheus_client
import
CollectorRegistry
,
multiprocess
from
sglang.srt.utils
import
get_local_ip_auto
,
get_zmq_socket
,
maybe_wrap_ipv6_address
if
TYPE_CHECKING
:
from
prometheus_client
import
CollectorRegistry
from
dynamo.common.utils.prometheus
import
register_engine_metrics_callback
from
dynamo.llm
import
(
ForwardPassMetrics
,
...
...
@@ -224,7 +226,7 @@ class DynamoSglangPublisher:
def
setup_prometheus_registry
(
engine
:
sgl
.
Engine
,
generate_endpoint
:
Endpoint
)
->
CollectorRegistry
:
)
->
"
CollectorRegistry
"
:
"""Set up Prometheus registry for SGLang metrics collection.
SGLang uses multiprocess architecture where metrics are stored in shared memory.
...
...
@@ -232,6 +234,11 @@ def setup_prometheus_registry(
registry collects sglang:* metrics which are exposed via the metrics server endpoint
(set DYN_SYSTEM_PORT to a positive value to enable, e.g., DYN_SYSTEM_PORT=8081).
IMPORTANT: prometheus_client must be imported AFTER sgl.Engine() has called
set_prometheus_multiproc_dir(). Importing at module level causes prometheus_client
to initialize in single-process mode before PROMETHEUS_MULTIPROC_DIR is set,
which breaks TokenizerMetricsCollector metrics (TTFT, ITL, e2e latency, etc.).
Args:
engine: The SGLang engine instance.
generate_endpoint: The Dynamo endpoint for generation requests.
...
...
@@ -239,6 +246,8 @@ def setup_prometheus_registry(
Returns:
Configured CollectorRegistry with multiprocess support.
"""
from
prometheus_client
import
CollectorRegistry
,
multiprocess
registry
=
CollectorRegistry
()
multiprocess
.
MultiProcessCollector
(
registry
)
register_engine_metrics_callback
(
...
...
components/src/dynamo/sglang/tests/test_sglang_prometheus_utils.py
View file @
3e341fd6
...
...
@@ -3,7 +3,7 @@
"""Unit tests for Prometheus utilities."""
from
unittest.mock
import
Mock
from
unittest.mock
import
Mock
,
patch
import
pytest
...
...
@@ -21,12 +21,7 @@ pytestmark = [
class
TestGetPrometheusExpfmt
:
"""Test class for get_prometheus_expfmt function."""
@
pytest
.
fixture
def
sglang_registry
(
self
):
"""Create a mock registry with SGLang-style metrics."""
registry
=
Mock
()
sample_metrics
=
"""# HELP python_gc_objects_collected_total Objects collected during gc
SAMPLE_METRICS
=
"""# HELP python_gc_objects_collected_total Objects collected during gc
# TYPE python_gc_objects_collected_total counter
python_gc_objects_collected_total{generation="0"} 123.0
# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds
...
...
@@ -43,25 +38,19 @@ sglang:generation_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 75
sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0075
"""
def
mock_generate_latest
(
reg
):
return
sample_metrics
.
encode
(
"utf-8"
)
import
dynamo.common.utils.prometheus
original_generate_latest
=
dynamo
.
common
.
utils
.
prometheus
.
generate_latest
dynamo
.
common
.
utils
.
prometheus
.
generate_latest
=
mock_generate_latest
yield
registry
dynamo
.
common
.
utils
.
prometheus
.
generate_latest
=
original_generate_latest
def
test_sglang_use_case
(
self
,
sglang_registry
):
def
test_sglang_use_case
(
self
):
"""Test SGLang use case: filter to sglang: metrics and exclude python_/process_."""
result
=
get_prometheus_expfmt
(
sglang_registry
,
metric_prefix_filters
=
[
"sglang:"
],
exclude_prefixes
=
[
"python_"
,
"process_"
],
)
registry
=
Mock
()
with
patch
(
"prometheus_client.generate_latest"
,
return_value
=
self
.
SAMPLE_METRICS
.
encode
(
"utf-8"
),
):
result
=
get_prometheus_expfmt
(
registry
,
metric_prefix_filters
=
[
"sglang:"
],
exclude_prefixes
=
[
"python_"
,
"process_"
],
)
# Should only contain sglang: metrics
assert
"sglang:prompt_tokens_total"
in
result
...
...
@@ -80,11 +69,13 @@ sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0075
def
test_error_handling
(
self
):
"""Test error handling when registry fails."""
# Create a registry that raises an exception
bad_registry
=
Mock
()
bad_registry
.
side_effect
=
Exception
(
"Registry error"
)
result
=
get_prometheus_expfmt
(
bad_registry
)
with
patch
(
"prometheus_client.generate_latest"
,
side_effect
=
Exception
(
"Registry error"
),
):
result
=
get_prometheus_expfmt
(
bad_registry
)
# Should return empty string on error
assert
result
==
""
components/src/dynamo/trtllm/tests/test_trtllm_prometheus_utils.py
View file @
3e341fd6
...
...
@@ -3,7 +3,7 @@
"""Unit tests for Prometheus utilities."""
from
unittest.mock
import
Mock
from
unittest.mock
import
Mock
,
patch
import
pytest
...
...
@@ -21,12 +21,7 @@ pytestmark = [
class
TestGetPrometheusExpfmt
:
"""Test class for get_prometheus_expfmt function."""
@
pytest
.
fixture
def
trtllm_registry
(
self
):
"""Create a mock registry with TensorRT-LLM-style metrics (no existing prefixes)."""
registry
=
Mock
()
sample_metrics
=
"""# HELP python_gc_objects_collected_total Objects collected during gc
TRTLLM_SAMPLE_METRICS
=
"""# HELP python_gc_objects_collected_total Objects collected during gc
# TYPE python_gc_objects_collected_total counter
python_gc_objects_collected_total{generation="0"} 123.0
# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds
...
...
@@ -44,25 +39,19 @@ num_requests_running 3.0
tokens_per_second 245.7
"""
def
mock_generate_latest
(
reg
):
return
sample_metrics
.
encode
(
"utf-8"
)
import
dynamo.common.utils.prometheus
original_generate_latest
=
dynamo
.
common
.
utils
.
prometheus
.
generate_latest
dynamo
.
common
.
utils
.
prometheus
.
generate_latest
=
mock_generate_latest
yield
registry
dynamo
.
common
.
utils
.
prometheus
.
generate_latest
=
original_generate_latest
def
test_trtllm_use_case
(
self
,
trtllm_registry
):
def
test_trtllm_use_case
(
self
):
"""Test TensorRT-LLM use case: exclude python_/process_ and add trtllm_ prefix."""
result
=
get_prometheus_expfmt
(
trtllm_registry
,
exclude_prefixes
=
[
"python_"
,
"process_"
],
add_prefix
=
"trtllm_"
,
)
registry
=
Mock
()
with
patch
(
"prometheus_client.generate_latest"
,
return_value
=
self
.
TRTLLM_SAMPLE_METRICS
.
encode
(
"utf-8"
),
):
result
=
get_prometheus_expfmt
(
registry
,
exclude_prefixes
=
[
"python_"
,
"process_"
],
add_prefix
=
"trtllm_"
,
)
# Should not contain excluded metrics
assert
"python_gc_objects_collected_total"
not
in
result
...
...
@@ -82,9 +71,15 @@ tokens_per_second 245.7
assert
"trtllm_tokens_per_second 245.7"
in
result
assert
result
.
endswith
(
"
\n
"
)
def
test_no_filtering_all_frameworks
(
self
,
trtllm_registry
):
def
test_no_filtering_all_frameworks
(
self
):
"""Test that without any filters, all metrics are returned."""
result
=
get_prometheus_expfmt
(
trtllm_registry
)
registry
=
Mock
()
with
patch
(
"prometheus_client.generate_latest"
,
return_value
=
self
.
TRTLLM_SAMPLE_METRICS
.
encode
(
"utf-8"
),
):
result
=
get_prometheus_expfmt
(
registry
)
# Should contain all metrics including excluded ones
assert
"python_gc_objects_collected_total"
in
result
...
...
@@ -93,12 +88,18 @@ tokens_per_second 245.7
assert
"num_requests_running"
in
result
assert
result
.
endswith
(
"
\n
"
)
def
test_empty_result_handling
(
self
,
trtllm_registry
):
def
test_empty_result_handling
(
self
):
"""Test handling when all metrics are filtered out."""
result
=
get_prometheus_expfmt
(
trtllm_registry
,
exclude_prefixes
=
[
"python_"
,
"process_"
,
"request_"
,
"num_"
,
"tokens_"
],
)
registry
=
Mock
()
with
patch
(
"prometheus_client.generate_latest"
,
return_value
=
self
.
TRTLLM_SAMPLE_METRICS
.
encode
(
"utf-8"
),
):
result
=
get_prometheus_expfmt
(
registry
,
exclude_prefixes
=
[
"python_"
,
"process_"
,
"request_"
,
"num_"
,
"tokens_"
],
)
# Should return empty string with newline or just newline
assert
result
==
"
\n
"
or
result
==
""
...
...
@@ -116,36 +117,31 @@ trtllm_request_success_total{model_name="test",finished_reason="stop"} 10.0
trtllm_time_to_first_token_seconds_count 5.0
"""
def
mock_generate_latest
(
reg
):
return
sample_metrics
.
encode
(
"utf-8"
)
import
dynamo.common.utils.prometheus
original_generate_latest
=
dynamo
.
common
.
utils
.
prometheus
.
generate_latest
dynamo
.
common
.
utils
.
prometheus
.
generate_latest
=
mock_generate_latest
try
:
with
patch
(
"prometheus_client.generate_latest"
,
return_value
=
sample_metrics
.
encode
(
"utf-8"
),
):
result
=
get_prometheus_expfmt
(
registry
,
exclude_prefixes
=
[
"python_"
,
"process_"
],
add_prefix
=
"trtllm_"
,
)
# Should not double-add prefix
assert
"trtllm_trtllm_request_success_total"
not
in
result
assert
"trtllm_request_success_total"
in
result
assert
"trtllm_time_to_first_token_seconds"
in
result
assert
result
.
endswith
(
"
\n
"
)
finally
:
dynamo
.
common
.
utils
.
prometheus
.
generate_latest
=
original_generate_latest
# Should not double-add prefix
assert
"trtllm_trtllm_request_success_total"
not
in
result
assert
"trtllm_request_success_total"
in
result
assert
"trtllm_time_to_first_token_seconds"
in
result
assert
result
.
endswith
(
"
\n
"
)
def
test_error_handling
(
self
):
"""Test error handling when registry fails."""
# Create a registry that raises an exception
bad_registry
=
Mock
()
bad_registry
.
side_effect
=
Exception
(
"Registry error"
)
result
=
get_prometheus_expfmt
(
bad_registry
)
with
patch
(
"prometheus_client.generate_latest"
,
side_effect
=
Exception
(
"Registry error"
),
):
result
=
get_prometheus_expfmt
(
bad_registry
)
# Should return empty string on error
assert
result
==
""
components/src/dynamo/vllm/tests/test_vllm_prometheus_utils.py
View file @
3e341fd6
...
...
@@ -3,7 +3,7 @@
"""Unit tests for Prometheus utilities."""
from
unittest.mock
import
Mock
from
unittest.mock
import
Mock
,
patch
import
pytest
...
...
@@ -21,12 +21,7 @@ pytestmark = [
class
TestGetPrometheusExpfmt
:
"""Test class for get_prometheus_expfmt function."""
@
pytest
.
fixture
def
vllm_registry
(
self
):
"""Create a mock registry with vLLM-style metrics."""
registry
=
Mock
()
sample_metrics
=
"""# HELP python_gc_objects_collected_total Objects collected during gc
SAMPLE_METRICS
=
"""# HELP python_gc_objects_collected_total Objects collected during gc
# TYPE python_gc_objects_collected_total counter
python_gc_objects_collected_total{generation="0"} 123.0
# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds
...
...
@@ -41,25 +36,19 @@ vllm:time_to_first_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-
vllm:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B"} 165.0
"""
def
mock_generate_latest
(
reg
):
return
sample_metrics
.
encode
(
"utf-8"
)
import
dynamo.common.utils.prometheus
original_generate_latest
=
dynamo
.
common
.
utils
.
prometheus
.
generate_latest
dynamo
.
common
.
utils
.
prometheus
.
generate_latest
=
mock_generate_latest
yield
registry
dynamo
.
common
.
utils
.
prometheus
.
generate_latest
=
original_generate_latest
def
test_vllm_use_case
(
self
,
vllm_registry
):
def
test_vllm_use_case
(
self
):
"""Test vLLM use case: filter to vllm: metrics and exclude python_/process_."""
result
=
get_prometheus_expfmt
(
vllm_registry
,
metric_prefix_filters
=
[
"vllm:"
],
exclude_prefixes
=
[
"python_"
,
"process_"
],
)
registry
=
Mock
()
with
patch
(
"prometheus_client.generate_latest"
,
return_value
=
self
.
SAMPLE_METRICS
.
encode
(
"utf-8"
),
):
result
=
get_prometheus_expfmt
(
registry
,
metric_prefix_filters
=
[
"vllm:"
],
exclude_prefixes
=
[
"python_"
,
"process_"
],
)
# Should only contain vllm: metrics
assert
"vllm:request_success_total"
in
result
...
...
@@ -77,11 +66,13 @@ vllm:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B"} 165
def
test_error_handling
(
self
):
"""Test error handling when registry fails."""
# Create a registry that raises an exception
bad_registry
=
Mock
()
bad_registry
.
side_effect
=
Exception
(
"Registry error"
)
result
=
get_prometheus_expfmt
(
bad_registry
)
with
patch
(
"prometheus_client.generate_latest"
,
side_effect
=
Exception
(
"Registry error"
),
):
result
=
get_prometheus_expfmt
(
bad_registry
)
# Should return empty string on error
assert
result
==
""
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment