Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
51ab3ccf
"examples/vscode:/vscode.git/clone" did not exist on "66d777e73ef87a9ee42f65ede5b9e256473fed24"
Unverified
Commit
51ab3ccf
authored
Jan 13, 2025
by
Lianmin Zheng
Committed by
GitHub
Jan 13, 2025
Browse files
Collect more metrics: num_requests_total (#2859)
parent
67008f4b
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
49 additions
and
45 deletions
+49
-45
python/sglang/srt/managers/tokenizer_manager.py
python/sglang/srt/managers/tokenizer_manager.py
+38
-40
python/sglang/srt/metrics/collector.py
python/sglang/srt/metrics/collector.py
+10
-5
test/srt/test_metrics.py
test/srt/test_metrics.py
+1
-0
No files found.
python/sglang/srt/managers/tokenizer_manager.py
View file @
51ab3ccf
...
@@ -601,7 +601,7 @@ class TokenizerManager:
...
@@ -601,7 +601,7 @@ class TokenizerManager:
while
not
self
.
gracefully_exit
:
while
not
self
.
gracefully_exit
:
await
asyncio
.
sleep
(
5
)
await
asyncio
.
sleep
(
5
)
#
d
rain requests
#
D
rain requests
while
True
:
while
True
:
remain_num_req
=
len
(
self
.
rid_to_state
)
remain_num_req
=
len
(
self
.
rid_to_state
)
logger
.
info
(
logger
.
info
(
...
@@ -679,45 +679,7 @@ class TokenizerManager:
...
@@ -679,45 +679,7 @@ class TokenizerManager:
state
.
event
.
set
()
state
.
event
.
set
()
if
self
.
enable_metrics
:
if
self
.
enable_metrics
:
completion_tokens
=
(
self
.
collect_metrics
(
state
,
recv_obj
,
i
)
recv_obj
.
completion_tokens
[
i
]
if
getattr
(
recv_obj
,
"completion_tokens"
,
None
)
else
0
)
if
state
.
first_token_time
is
None
:
state
.
first_token_time
=
time
.
time
()
self
.
metrics_collector
.
observe_time_to_first_token
(
state
.
first_token_time
-
state
.
created_time
)
else
:
if
completion_tokens
>=
2
:
# Compute time_per_output_token for the streaming case
self
.
metrics_collector
.
observe_time_per_output_token
(
(
time
.
time
()
-
state
.
first_token_time
)
/
(
completion_tokens
-
1
)
)
if
state
.
finished
:
self
.
metrics_collector
.
inc_prompt_tokens
(
recv_obj
.
prompt_tokens
[
i
]
)
self
.
metrics_collector
.
inc_generation_tokens
(
completion_tokens
)
self
.
metrics_collector
.
observe_e2e_request_latency
(
time
.
time
()
-
state
.
created_time
)
# Compute time_per_output_token for the non-streaming case
if
(
hasattr
(
state
.
obj
,
"stream"
)
and
not
state
.
obj
.
stream
and
completion_tokens
>=
1
):
self
.
metrics_collector
.
observe_time_per_output_token
(
(
time
.
time
()
-
state
.
created_time
)
/
completion_tokens
)
elif
isinstance
(
recv_obj
,
OpenSessionReqOutput
):
elif
isinstance
(
recv_obj
,
OpenSessionReqOutput
):
self
.
session_futures
[
recv_obj
.
session_id
].
set_result
(
self
.
session_futures
[
recv_obj
.
session_id
].
set_result
(
recv_obj
.
session_id
if
recv_obj
.
success
else
None
recv_obj
.
session_id
if
recv_obj
.
success
else
None
...
@@ -820,6 +782,42 @@ class TokenizerManager:
...
@@ -820,6 +782,42 @@ class TokenizerManager:
ret
.
append
(
None
)
ret
.
append
(
None
)
return
ret
return
ret
def
collect_metrics
(
self
,
state
:
ReqState
,
recv_obj
:
BatchStrOut
,
i
:
int
):
completion_tokens
=
(
recv_obj
.
completion_tokens
[
i
]
if
getattr
(
recv_obj
,
"completion_tokens"
,
None
)
else
0
)
if
state
.
first_token_time
is
None
:
state
.
first_token_time
=
time
.
time
()
self
.
metrics_collector
.
observe_time_to_first_token
(
state
.
first_token_time
-
state
.
created_time
)
else
:
if
completion_tokens
>=
2
:
# Compute time_per_output_token for the streaming case
self
.
metrics_collector
.
observe_time_per_output_token
(
(
time
.
time
()
-
state
.
first_token_time
)
/
(
completion_tokens
-
1
)
)
if
state
.
finished
:
self
.
metrics_collector
.
observe_one_finished_request
(
recv_obj
.
prompt_tokens
[
i
],
completion_tokens
)
self
.
metrics_collector
.
observe_e2e_request_latency
(
time
.
time
()
-
state
.
created_time
)
# Compute time_per_output_token for the non-streaming case
if
(
hasattr
(
state
.
obj
,
"stream"
)
and
not
state
.
obj
.
stream
and
completion_tokens
>=
1
):
self
.
metrics_collector
.
observe_time_per_output_token
(
(
time
.
time
()
-
state
.
created_time
)
/
completion_tokens
)
class
SignalHandler
:
class
SignalHandler
:
def
__init__
(
self
,
tokenizer_manager
):
def
__init__
(
self
,
tokenizer_manager
):
...
...
python/sglang/srt/metrics/collector.py
View file @
51ab3ccf
...
@@ -109,6 +109,12 @@ class TokenizerMetricsCollector:
...
@@ -109,6 +109,12 @@ class TokenizerMetricsCollector:
labelnames
=
labels
.
keys
(),
labelnames
=
labels
.
keys
(),
)
)
self
.
num_requests_total
=
Counter
(
name
=
"sglang:num_requests_total"
,
documentation
=
"Number of requests processed."
,
labelnames
=
labels
.
keys
(),
)
self
.
histogram_time_to_first_token
=
Histogram
(
self
.
histogram_time_to_first_token
=
Histogram
(
name
=
"sglang:time_to_first_token_seconds"
,
name
=
"sglang:time_to_first_token_seconds"
,
documentation
=
"Histogram of time to first token in seconds."
,
documentation
=
"Histogram of time to first token in seconds."
,
...
@@ -185,11 +191,10 @@ class TokenizerMetricsCollector:
...
@@ -185,11 +191,10 @@ class TokenizerMetricsCollector:
# Convenience function for logging to counter.
# Convenience function for logging to counter.
counter
.
labels
(
**
self
.
labels
).
inc
(
data
)
counter
.
labels
(
**
self
.
labels
).
inc
(
data
)
def
inc_prompt_tokens
(
self
,
value
:
int
):
def
observe_one_finished_request
(
self
,
prompt_tokens
:
int
,
generation_tokens
:
int
):
self
.
_log_counter
(
self
.
prompt_tokens_total
,
value
)
self
.
prompt_tokens_total
.
labels
(
**
self
.
labels
).
inc
(
prompt_tokens
)
self
.
generation_tokens_total
.
labels
(
**
self
.
labels
).
inc
(
generation_tokens
)
def
inc_generation_tokens
(
self
,
value
:
int
):
self
.
num_requests_total
.
labels
(
**
self
.
labels
).
inc
(
1
)
self
.
_log_counter
(
self
.
generation_tokens_total
,
value
)
def
observe_time_to_first_token
(
self
,
value
:
Union
[
float
,
int
]):
def
observe_time_to_first_token
(
self
,
value
:
Union
[
float
,
int
]):
self
.
_log_histogram
(
self
.
histogram_time_to_first_token
,
value
)
self
.
_log_histogram
(
self
.
histogram_time_to_first_token
,
value
)
...
...
test/srt/test_metrics.py
View file @
51ab3ccf
...
@@ -59,6 +59,7 @@ class TestEnableMetrics(unittest.TestCase):
...
@@ -59,6 +59,7 @@ class TestEnableMetrics(unittest.TestCase):
"sglang:func_latency_seconds"
,
"sglang:func_latency_seconds"
,
"sglang:prompt_tokens_total"
,
"sglang:prompt_tokens_total"
,
"sglang:generation_tokens_total"
,
"sglang:generation_tokens_total"
,
"sglang:num_requests_total"
,
"sglang:time_to_first_token_seconds"
,
"sglang:time_to_first_token_seconds"
,
"sglang:time_per_output_token_seconds"
,
"sglang:time_per_output_token_seconds"
,
"sglang:e2e_request_latency_seconds"
,
"sglang:e2e_request_latency_seconds"
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment