Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
95a4ed12
"vscode:/vscode.git/clone" did not exist on "7b4b8129e310ba577ded511a4a4c2d54bab7357d"
Unverified
Commit
95a4ed12
authored
Nov 08, 2024
by
Yudi Xue
Committed by
GitHub
Nov 08, 2024
Browse files
Fix metrics (#1963)
parent
d1150e9a
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
142 additions
and
8 deletions
+142
-8
python/pyproject.toml
python/pyproject.toml
+1
-1
python/sglang/srt/metrics/metrics_collector.py
python/sglang/srt/metrics/metrics_collector.py
+51
-3
python/sglang/srt/metrics/metrics_types.py
python/sglang/srt/metrics/metrics_types.py
+0
-3
python/sglang/srt/server.py
python/sglang/srt/server.py
+3
-1
python/sglang/test/test_utils.py
python/sglang/test/test_utils.py
+3
-0
test/srt/test_enable_metrics.py
test/srt/test_enable_metrics.py
+84
-0
No files found.
python/pyproject.toml
View file @
95a4ed12
...
...
@@ -17,7 +17,7 @@ dependencies = ["requests", "tqdm", "numpy", "IPython"]
[project.optional-dependencies]
runtime_common
=
[
"aiohttp"
,
"decord"
,
"fastapi"
,
"hf_transfer"
,
"huggingface_hub"
,
"interegular"
,
"orjson"
,
"packaging"
,
"pillow"
,
"psutil"
,
"pydantic"
,
"python-multipart"
,
"orjson"
,
"packaging"
,
"pillow"
,
"prometheus-client>=0.20.0"
,
"psutil"
,
"pydantic"
,
"python-multipart"
,
"torchao"
,
"uvicorn"
,
"uvloop"
,
"zmq"
,
"outlines>=0.0.44"
,
"modelscope"
]
srt
=
["sglang[runtime_common]
", "
torch
", "
vllm==
0.6.3
.post
1
"]
...
...
python/sglang/srt/metrics/metrics_collector.py
View file @
95a4ed12
...
...
@@ -213,19 +213,67 @@ class Metrics:
name
=
"sglang:e2e_request_latency_seconds"
,
documentation
=
"Histogram of End-to-end request latency in seconds"
,
labelnames
=
labelnames
,
buckets
=
build_1_2_5_buckets
(
max_model_len
),
buckets
=
[
0.3
,
0.5
,
0.8
,
1.0
,
1.5
,
2.0
,
2.5
,
5.0
,
10.0
,
15.0
,
20.0
,
30.0
,
40.0
,
50.0
,
60.0
,
],
)
self
.
histogram_time_waiting_requests
=
Histogram
(
name
=
"sglang:waiting_request_latency_seconds"
,
documentation
=
"Histogram of request waiting time in seconds"
,
labelnames
=
labelnames
,
buckets
=
build_1_2_5_buckets
(
max_model_len
),
buckets
=
[
0.3
,
0.5
,
0.8
,
1.0
,
1.5
,
2.0
,
2.5
,
5.0
,
10.0
,
15.0
,
20.0
,
30.0
,
40.0
,
50.0
,
60.0
,
],
)
self
.
histogram_time_decode_requests
=
Histogram
(
name
=
"sglang:decode_request_latency_seconds"
,
documentation
=
"Histogram of request decoding time in seconds"
,
labelnames
=
labelnames
,
buckets
=
build_1_2_5_buckets
(
max_model_len
),
buckets
=
[
0.3
,
0.5
,
0.8
,
1.0
,
1.5
,
2.0
,
2.5
,
5.0
,
10.0
,
15.0
,
20.0
,
30.0
,
40.0
,
50.0
,
60.0
,
],
)
...
...
python/sglang/srt/metrics/metrics_types.py
View file @
95a4ed12
...
...
@@ -34,15 +34,12 @@ class Stats:
num_running_req
:
int
=
0
num_waiting_req
:
int
=
0
gen_throughput
:
float
=
0.0
num_token
:
int
=
0
token_usage
:
float
=
0.0
waiting_queue
:
int
=
0
time_e2e_requests
:
List
[
float
]
=
field
(
default_factory
=
list
)
time_waiting_requests
:
List
[
float
]
=
field
(
default_factory
=
list
)
time_decode_requests
:
List
[
float
]
=
field
(
default_factory
=
list
)
# system stats
token_usage
:
float
=
0.0
is_mixed_chunk
:
bool
=
False
new_seq
:
int
=
0
new_token
:
int
=
0
cached_token
:
int
=
0
...
...
python/sglang/srt/server.py
View file @
95a4ed12
...
...
@@ -446,6 +446,9 @@ def launch_server(
2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
"""
if
server_args
.
enable_metrics
:
_set_prometheus_env
()
launch_engine
(
server_args
=
server_args
)
# Add api key authorization
...
...
@@ -454,7 +457,6 @@ def launch_server(
# add prometheus middleware
if
server_args
.
enable_metrics
:
_set_prometheus_env
()
add_prometheus_middleware
(
app
)
# Send a warmup request
...
...
python/sglang/test/test_utils.py
View file @
95a4ed12
...
...
@@ -404,6 +404,7 @@ def popen_launch_server(
other_args
:
tuple
=
(),
env
:
Optional
[
dict
]
=
None
,
return_stdout_stderr
:
Optional
[
tuple
]
=
None
,
enable_metrics
:
bool
=
False
,
):
_
,
host
,
port
=
base_url
.
split
(
":"
)
host
=
host
[
2
:]
...
...
@@ -422,6 +423,8 @@ def popen_launch_server(
]
if
api_key
:
command
+=
[
"--api-key"
,
api_key
]
if
enable_metrics
:
command
+=
[
"--enable-metrics"
]
if
return_stdout_stderr
:
process
=
subprocess
.
Popen
(
...
...
test/srt/test_enable_metrics.py
0 → 100644
View file @
95a4ed12
import
unittest
from
types
import
SimpleNamespace
import
requests
from
sglang.srt.utils
import
kill_child_process
from
sglang.test.run_eval
import
run_eval
from
sglang.test.test_utils
import
(
DEFAULT_MODEL_NAME_FOR_TEST
,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_URL_FOR_TEST
,
popen_launch_server
,
)
# Model that the metrics tests launch the server with; bound to the shared
# default test model imported from sglang.test.test_utils.
TEST_MODEL = (
    DEFAULT_MODEL_NAME_FOR_TEST  # I used "google/gemma-2-2b-it" for testing locally
)
class TestEnableMetrics(unittest.TestCase):
    """Checks of the server's ``/metrics`` endpoint with metrics on and off.

    Each test launches its own server process via ``popen_launch_server`` and
    always tears it down in a ``finally`` clause, so a failing assertion does
    not leave a server running.
    """

    # Upper bound (seconds) for each HTTP call made by the tests. Without a
    # timeout, ``requests.get`` waits forever if the server stops responding,
    # which would hang the whole test run instead of failing it.
    REQUEST_TIMEOUT_SECONDS = 60

    def test_metrics_enabled(self):
        """Test that metrics endpoint returns data when enabled"""
        # Launch server with metrics enabled
        process = popen_launch_server(
            model=TEST_MODEL,
            base_url=DEFAULT_URL_FOR_TEST,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            enable_metrics=True,
        )

        try:
            # Make a request to generate some metrics
            response = requests.get(
                f"{DEFAULT_URL_FOR_TEST}/health_generate",
                timeout=self.REQUEST_TIMEOUT_SECONDS,
            )
            self.assertEqual(response.status_code, 200)

            # Get metrics
            metrics_response = requests.get(
                f"{DEFAULT_URL_FOR_TEST}/metrics",
                timeout=self.REQUEST_TIMEOUT_SECONDS,
            )
            self.assertEqual(metrics_response.status_code, 200)
            metrics_content = metrics_response.text

            # Verify essential metrics are present
            essential_metrics = [
                "sglang:prompt_tokens_total",
                "sglang:generation_tokens_total",
                "sglang:max_total_num_tokens",
                "sglang:context_len",
                "sglang:time_to_first_token_seconds",
                "sglang:time_per_output_token_seconds",
                "sglang:e2e_request_latency_seconds",
            ]

            for metric in essential_metrics:
                self.assertIn(metric, metrics_content, f"Missing metric: {metric}")

            # Verify model name label is present and correct
            expected_model_name = TEST_MODEL
            self.assertIn(f'model_name="{expected_model_name}"', metrics_content)

            # Verify labelled _sum / _count / _bucket series are present,
            # i.e. the histogram metrics actually carry samples (not empty)
            self.assertIn("_sum{", metrics_content)
            self.assertIn("_count{", metrics_content)
            self.assertIn("_bucket{", metrics_content)

        finally:
            # Always stop the launched server, even if an assertion failed.
            kill_child_process(process.pid, include_self=True)

    def test_metrics_disabled(self):
        """Test that metrics endpoint returns 404 when disabled"""
        # Launch server with metrics disabled
        process = popen_launch_server(
            model=TEST_MODEL,
            base_url=DEFAULT_URL_FOR_TEST,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            enable_metrics=False,
        )

        try:
            # The server itself must be healthy before checking /metrics,
            # so a 404 below cannot be confused with a dead server.
            response = requests.get(
                f"{DEFAULT_URL_FOR_TEST}/health_generate",
                timeout=self.REQUEST_TIMEOUT_SECONDS,
            )
            self.assertEqual(response.status_code, 200)

            # Verify metrics endpoint is not available
            metrics_response = requests.get(
                f"{DEFAULT_URL_FOR_TEST}/metrics",
                timeout=self.REQUEST_TIMEOUT_SECONDS,
            )
            self.assertEqual(metrics_response.status_code, 404)

        finally:
            # Always stop the launched server, even if an assertion failed.
            kill_child_process(process.pid, include_self=True)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment