Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
8f868986
Unverified
Commit
8f868986
authored
Nov 10, 2025
by
Aryan Bagade
Committed by
GitHub
Nov 11, 2025
Browse files
feat: Add output token counter to frontend metrics (#4202)
Signed-off-by:
Aryan Bagade
<
aryan@aryanbagade.com
>
parent
bba41321
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
186 additions
and
7 deletions
+186
-7
lib/bindings/python/src/dynamo/prometheus_names.py
lib/bindings/python/src/dynamo/prometheus_names.py
+17
-7
lib/llm/src/http/service/metrics.rs
lib/llm/src/http/service/metrics.rs
+166
-0
lib/runtime/src/metrics/prometheus_names.rs
lib/runtime/src/metrics/prometheus_names.rs
+3
-0
No files found.
lib/bindings/python/src/dynamo/prometheus_names.py
View file @
8f868986
...
@@ -55,6 +55,8 @@ class frontend_service:
...
@@ -55,6 +55,8 @@ class frontend_service:
INPUT_SEQUENCE_TOKENS
=
"input_sequence_tokens"
INPUT_SEQUENCE_TOKENS
=
"input_sequence_tokens"
# Output sequence length in tokens
# Output sequence length in tokens
OUTPUT_SEQUENCE_TOKENS
=
"output_sequence_tokens"
OUTPUT_SEQUENCE_TOKENS
=
"output_sequence_tokens"
# Total number of output tokens generated (counter that updates in real-time)
OUTPUT_TOKENS_TOTAL
=
"output_tokens_total"
# Time to first token in seconds
# Time to first token in seconds
TIME_TO_FIRST_TOKEN_SECONDS
=
"time_to_first_token_seconds"
TIME_TO_FIRST_TOKEN_SECONDS
=
"time_to_first_token_seconds"
# Inter-token latency in seconds
# Inter-token latency in seconds
...
@@ -76,13 +78,21 @@ class frontend_service:
...
@@ -76,13 +78,21 @@ class frontend_service:
MODEL_MIGRATION_LIMIT
=
"model_migration_limit"
MODEL_MIGRATION_LIMIT
=
"model_migration_limit"
class
kvbm_connector
:
class
kvbm
:
"""KVBM connector"""
"""KVBM"""
# KVBM connector leader
# The number of offload blocks from device to host
KVBM_CONNECTOR_LEADER
=
"kvbm_connector_leader"
OFFLOAD_BLOCKS_D2H
=
"offload_blocks_d2h"
# KVBM connector worker
# The number of offload blocks from host to disk
KVBM_CONNECTOR_WORKER
=
"kvbm_connector_worker"
OFFLOAD_BLOCKS_H2D
=
"offload_blocks_h2d"
# The number of offload blocks from device to disk (bypassing host memory)
OFFLOAD_BLOCKS_D2D
=
"offload_blocks_d2d"
# The number of onboard blocks from host to device
ONBOARD_BLOCKS_H2D
=
"onboard_blocks_h2d"
# The number of onboard blocks from disk to device
ONBOARD_BLOCKS_D2D
=
"onboard_blocks_d2d"
# The number of matched tokens
MATCHED_TOKENS
=
"matched_tokens"
class
kvrouter
:
class
kvrouter
:
...
...
lib/llm/src/http/service/metrics.rs
View file @
8f868986
...
@@ -161,6 +161,7 @@ pub struct Metrics {
...
@@ -161,6 +161,7 @@ pub struct Metrics {
request_duration
:
HistogramVec
,
request_duration
:
HistogramVec
,
input_sequence_length
:
HistogramVec
,
input_sequence_length
:
HistogramVec
,
output_sequence_length
:
HistogramVec
,
output_sequence_length
:
HistogramVec
,
output_tokens_counter
:
IntCounterVec
,
time_to_first_token
:
HistogramVec
,
time_to_first_token
:
HistogramVec
,
inter_token_latency
:
HistogramVec
,
inter_token_latency
:
HistogramVec
,
...
@@ -266,6 +267,7 @@ impl Metrics {
...
@@ -266,6 +267,7 @@ impl Metrics {
/// - `{prefix}_request_duration_seconds` - HistogramVec for the duration of requests
/// - `{prefix}_request_duration_seconds` - HistogramVec for the duration of requests
/// - `{prefix}_input_sequence_tokens` - HistogramVec for input sequence length in tokens
/// - `{prefix}_input_sequence_tokens` - HistogramVec for input sequence length in tokens
/// - `{prefix}_output_sequence_tokens` - HistogramVec for output sequence length in tokens
/// - `{prefix}_output_sequence_tokens` - HistogramVec for output sequence length in tokens
/// - `{prefix}_output_tokens_total` - IntCounterVec for total output tokens generated (real-time updates)
/// - `{prefix}_time_to_first_token_seconds` - HistogramVec for time to first token in seconds
/// - `{prefix}_time_to_first_token_seconds` - HistogramVec for time to first token in seconds
/// - `{prefix}_inter_token_latency_seconds` - HistogramVec for inter-token latency in seconds
/// - `{prefix}_inter_token_latency_seconds` - HistogramVec for inter-token latency in seconds
///
///
...
@@ -392,6 +394,15 @@ impl Metrics {
...
@@ -392,6 +394,15 @@ impl Metrics {
)
)
.unwrap
();
.unwrap
();
let
output_tokens_counter
=
IntCounterVec
::
new
(
Opts
::
new
(
frontend_metric_name
(
frontend_service
::
OUTPUT_TOKENS_TOTAL
),
"Total number of output tokens generated (updates in real-time)"
,
),
&
[
"model"
],
)
.unwrap
();
// Time to first token buckets: configurable via DYN_METRICS_TTFT_{MIN,MAX,COUNT}
// Time to first token buckets: configurable via DYN_METRICS_TTFT_{MIN,MAX,COUNT}
let
(
ttft_min
,
ttft_max
,
ttft_count
)
=
let
(
ttft_min
,
ttft_max
,
ttft_count
)
=
parse_bucket_config
(
"DYN_METRICS_TTFT"
,
0.001
,
480.0
,
18
);
parse_bucket_config
(
"DYN_METRICS_TTFT"
,
0.001
,
480.0
,
18
);
...
@@ -487,6 +498,7 @@ impl Metrics {
...
@@ -487,6 +498,7 @@ impl Metrics {
request_duration
,
request_duration
,
input_sequence_length
,
input_sequence_length
,
output_sequence_length
,
output_sequence_length
,
output_tokens_counter
,
time_to_first_token
,
time_to_first_token
,
inter_token_latency
,
inter_token_latency
,
model_total_kv_blocks
,
model_total_kv_blocks
,
...
@@ -581,6 +593,7 @@ impl Metrics {
...
@@ -581,6 +593,7 @@ impl Metrics {
registry
.register
(
Box
::
new
(
self
.request_duration
.clone
()))
?
;
registry
.register
(
Box
::
new
(
self
.request_duration
.clone
()))
?
;
registry
.register
(
Box
::
new
(
self
.input_sequence_length
.clone
()))
?
;
registry
.register
(
Box
::
new
(
self
.input_sequence_length
.clone
()))
?
;
registry
.register
(
Box
::
new
(
self
.output_sequence_length
.clone
()))
?
;
registry
.register
(
Box
::
new
(
self
.output_sequence_length
.clone
()))
?
;
registry
.register
(
Box
::
new
(
self
.output_tokens_counter
.clone
()))
?
;
registry
.register
(
Box
::
new
(
self
.time_to_first_token
.clone
()))
?
;
registry
.register
(
Box
::
new
(
self
.time_to_first_token
.clone
()))
?
;
registry
.register
(
Box
::
new
(
self
.inter_token_latency
.clone
()))
?
;
registry
.register
(
Box
::
new
(
self
.inter_token_latency
.clone
()))
?
;
...
@@ -832,6 +845,12 @@ impl ResponseMetricCollector {
...
@@ -832,6 +845,12 @@ impl ResponseMetricCollector {
return
;
return
;
}
}
// Increment the real-time output tokens counter
self
.metrics
.output_tokens_counter
.with_label_values
(
&
[
&
self
.model
])
.inc_by
(
num_tokens
as
u64
);
if
self
.is_first_token
{
if
self
.is_first_token
{
// NOTE: when there are multiple tokens in the first response,
// NOTE: when there are multiple tokens in the first response,
// we use the full response time as TTFT and ignore the ITL
// we use the full response time as TTFT and ignore the ITL
...
@@ -1187,4 +1206,151 @@ mod tests {
...
@@ -1187,4 +1206,151 @@ mod tests {
);
);
}
}
}
}
/// Verifies that the per-model output token counter accumulates the token
/// count of every observed response chunk.
#[test]
fn test_output_tokens_counter_increments() {
    let model = "test-model";

    let metrics = Arc::new(Metrics::new());
    let registry = prometheus::Registry::new();
    metrics.register(&registry).unwrap();

    // The collector owns its own `Arc` clone, so `metrics` stays readable below.
    let mut collector = metrics.clone().create_response_collector(model);

    // Current counter value for `model`.
    let total = || {
        metrics
            .output_tokens_counter
            .with_label_values(&[model])
            .get()
    };

    // (tokens in the observed chunk, expected running total afterwards)
    for (chunk_tokens, expected_total) in [(5, 5), (10, 15), (7, 22)] {
        collector.observe_response(100, chunk_tokens);
        assert_eq!(total(), expected_total);
    }
}
/// Verifies that observing a chunk with zero tokens leaves the output token
/// counter unchanged, both before and after real tokens have been counted.
#[test]
fn test_output_tokens_counter_zero_tokens() {
    let model = "test-model";

    let metrics = Arc::new(Metrics::new());
    let registry = prometheus::Registry::new();
    metrics.register(&registry).unwrap();

    let mut collector = metrics.clone().create_response_collector(model);

    // Current counter value for `model`.
    let total = || {
        metrics
            .output_tokens_counter
            .with_label_values(&[model])
            .get()
    };

    // Steps, in order: an empty chunk (counter stays 0), a 5-token chunk
    // (counter becomes 5), then another empty chunk (counter stays 5).
    for (chunk_tokens, expected_total) in [(0, 0), (5, 5), (0, 5)] {
        collector.observe_response(100, chunk_tokens);
        assert_eq!(total(), expected_total);
    }
}
/// Verifies that the output token counter is tracked independently per
/// `model` label: observing tokens for one model never changes the other's total.
#[test]
fn test_output_tokens_counter_multiple_models() {
    let metrics = Arc::new(Metrics::new());
    let registry = prometheus::Registry::new();
    metrics.register(&registry).unwrap();

    let (model1, model2) = ("model-1", "model-2");

    // One collector per model, each holding its own `Arc` clone of the metrics.
    let mut collector1 = metrics.clone().create_response_collector(model1);
    let mut collector2 = metrics.clone().create_response_collector(model2);

    // Counter value for a single model label.
    let read = |name: &str| {
        metrics
            .output_tokens_counter
            .with_label_values(&[name])
            .get()
    };
    // Snapshot of both counters as (model1 total, model2 total).
    let totals = || (read(model1), read(model2));

    // Only model1 has produced tokens so far.
    collector1.observe_response(100, 10);
    assert_eq!(totals(), (10, 0));

    // model2 catches up without disturbing model1.
    collector2.observe_response(200, 20);
    assert_eq!(totals(), (10, 20));

    // A further model1 chunk adds to model1 only.
    collector1.observe_response(100, 5);
    assert_eq!(totals(), (15, 20));
}
}
}
lib/runtime/src/metrics/prometheus_names.rs
View file @
8f868986
...
@@ -113,6 +113,9 @@ pub mod frontend_service {
...
@@ -113,6 +113,9 @@ pub mod frontend_service {
/// Output sequence length in tokens
/// Output sequence length in tokens
pub
const
OUTPUT_SEQUENCE_TOKENS
:
&
str
=
"output_sequence_tokens"
;
pub
const
OUTPUT_SEQUENCE_TOKENS
:
&
str
=
"output_sequence_tokens"
;
/// Total number of output tokens generated (counter that updates in real-time)
///
/// Metric name suffix: the HTTP service metrics pass this constant through
/// `frontend_metric_name` to name an `IntCounterVec` labelled by `model`.
pub
const
OUTPUT_TOKENS_TOTAL
:
&
str
=
"output_tokens_total"
;
/// Time to first token in seconds
/// Time to first token in seconds
pub
const
TIME_TO_FIRST_TOKEN_SECONDS
:
&
str
=
"time_to_first_token_seconds"
;
pub
const
TIME_TO_FIRST_TOKEN_SECONDS
:
&
str
=
"time_to_first_token_seconds"
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment