sglang commit cd493b5a (unverified)
Improve metrics, logging, and importing orders (#2992)

Authored by Lianmin Zheng on Jan 19, 2025; committed via GitHub on Jan 19, 2025.
Parent: 61f42b57
Showing 8 changed files with 63 additions and 48 deletions (+63, -48).
.github/workflows/pr-test.yml                        +1  -1
examples/runtime/engine/offline_batch_inference.py   +5  -0
python/sglang/__init__.py                            +21 -23
python/sglang/lang/backend/runtime_endpoint.py       +13 -7
python/sglang/srt/managers/scheduler.py              +4  -2
python/sglang/srt/metrics/collector.py               +15 -6
sgl-router/py_src/sglang_router/__init__.py          +3  -7
test/srt/run_suite.py                                +1  -2
.github/workflows/pr-test.yml

...
@@ -52,7 +52,7 @@ jobs:
     runs-on: 1-gpu-runner
     strategy:
       matrix:
-        range: [0-6, 6-16, 16-23, 23-30, 30-38, 38-100]
+        range: [0-6, 6-15, 15-22, 22-32, 32-37, 37-100]
     steps:
       - name: Checkout code
         uses: actions/checkout@v3
...
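The updated `range` matrix re-balances how the unit-test files are sharded across the parallel 1-gpu-runner jobs; each `begin-end` entry selects a contiguous slice of the suite. A rough sketch of the idea (the helper below is illustrative, not code from this commit):

```python
# Illustrative only: shard a test suite by a "begin-end" range string,
# mirroring how each CI matrix entry picks a slice of the test files.
def select_shard(test_files: list[str], shard: str) -> list[str]:
    begin, end = (int(part) for part in shard.split("-"))
    return test_files[begin:end]

files = [f"test_{i:02d}.py" for i in range(40)]
print(select_shard(files, "6-15"))  # files 6 through 14 run on this runner
```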
examples/runtime/engine/offline_batch_inference.py

+"""
+Usage:
+python3 offline_batch_inference.py --model meta-llama/Llama-3.1-8B-Instruct
+"""
 import argparse
 import dataclasses
...
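The added docstring documents how to launch the example. For context, the example drives the offline `sglang.Engine` roughly like this (a simplified sketch; the prompts and sampling parameters are illustrative):

```python
import sglang as sgl

if __name__ == "__main__":
    # Offline batch inference: no HTTP server, the engine runs in-process.
    llm = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")
    prompts = ["Hello, my name is", "The capital of France is"]
    outputs = llm.generate(prompts, {"temperature": 0.8, "top_p": 0.95})
    for prompt, output in zip(prompts, outputs):
        print(prompt, "->", output["text"])
    llm.shutdown()
```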
python/sglang/__init__.py

-# SGL API Components
+# SGLang public APIs
+
+# Frontend Language APIs
 from sglang.api import (
     Engine,
     Runtime,
...
@@ -23,16 +24,26 @@ from sglang.api import (
     user_end,
     video,
 )
-from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.lang.choices import (
     greedy_token_selection,
     token_length_normalized,
     unconditional_likelihood_normalized,
 )
-from sglang.utils import LazyImport
-
-Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
-LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
-OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
-VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
-
-# Other configs
-from sglang.global_config import global_config
-from sglang.version import __version__

+# SGLang DSL APIs
 __all__ = [
-    "Runtime",
     "Engine",
+    "Runtime",
     "assistant",
     "assistant_begin",
     "assistant_end",
...
@@ -52,27 +63,14 @@ __all__ = [
     "user_begin",
     "user_end",
     "video",
-    "RuntimeEndpoint",
     "greedy_token_selection",
     "token_length_normalized",
     "unconditional_likelihood_normalized",
-    "Anthropic",
-    "LiteLLM",
-    "OpenAI",
-    "VertexAI",
-    "global_config",
-    "__version__",
 ]
+
+# Global Configurations
+from sglang.global_config import global_config
+
+__all__ += ["global_config"]
+
+from sglang.version import __version__
+
+__all__ += ["__version__"]
+
+# SGLang Backends
+from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.utils import LazyImport
+
+Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
+LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
+OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
+VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
+
+__all__ += ["Anthropic", "LiteLLM", "OpenAI", "VertexAI", "RuntimeEndpoint"]
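The reordering groups the module into three blocks: frontend-language APIs, global configuration, and backends. The backends stay behind `LazyImport` so that `import sglang` does not pull in optional dependencies (e.g. the `openai` or `anthropic` packages) until a backend is actually used. A minimal sketch of the pattern, assuming the semantics rather than the exact `sglang.utils.LazyImport` code:

```python
import importlib


class LazyImport:
    """Resolve `module.name` on first use instead of at import time."""

    def __init__(self, module: str, name: str):
        self.module = module
        self.name = name
        self._target = None

    def _load(self):
        # The real import happens here, only once, on first access.
        if self._target is None:
            self._target = getattr(importlib.import_module(self.module), self.name)
        return self._target

    def __getattr__(self, attr):
        return getattr(self._load(), attr)

    def __call__(self, *args, **kwargs):
        return self._load()(*args, **kwargs)


# Constructing the proxy is free; the backend module is imported
# only when OpenAI(...) is first called or an attribute is read.
OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
```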
python/sglang/lang/backend/runtime_endpoint.py

...
@@ -19,9 +19,6 @@ from sglang.lang.ir import (
     REGEX_STR,
     SglSamplingParams,
 )
-from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import is_port_available, kill_process_tree
 from sglang.utils import http_request
...
@@ -342,7 +339,7 @@ class Runtime:
     using the commond line interface.
     It is mainly used for the frontend language.
-    You should use the Engine class if you want to do normal offline processing.
+    You should use the Engine class if you want to do normal offline processing without the frontend language.
     """

     def __init__(
...
@@ -352,13 +349,14 @@ class Runtime:
         **kwargs,
     ):
         """See the arguments in server_args.py::ServerArgs"""
+        # We delay the import of any `sglang.srt` components in `sglang.lang`, so users can run
+        # client code without installing SRT server and its dependency if they want.
         from sglang.srt.server import launch_server
+        from sglang.srt.server_args import ServerArgs
+        from sglang.srt.utils import is_port_available

         self.server_args = ServerArgs(*args, log_level=log_level, **kwargs)

-        # before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
-        atexit.register(self.shutdown)
-
         # Pre-allocate ports
         for port in range(self.server_args.port, 40000):
             if is_port_available(port):
...
@@ -380,6 +378,10 @@ class Runtime:
         pipe_writer.close()
         self.pid = proc.pid

+        # Before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
+        atexit.register(self.shutdown)
+
+        # TODO: remove this pipe_writer mechanism and use `/health_generate` instead.
         try:
             init_state = pipe_reader.recv()
         except EOFError:
...
@@ -394,6 +396,8 @@ class Runtime:
         self.endpoint = RuntimeEndpoint(self.url)

     def shutdown(self):
+        from sglang.srt.utils import kill_process_tree
+
         if self.pid is not None:
             kill_process_tree(self.pid)
             self.pid = None
...
@@ -402,6 +406,8 @@ class Runtime:
         self.endpoint.cache_prefix(prefix)

     def get_tokenizer(self):
+        from sglang.srt.hf_transformers_utils import get_tokenizer
+
         return get_tokenizer(
             self.server_args.tokenizer_path,
             tokenizer_mode=self.server_args.tokenizer_mode,
...
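The theme of this file's changes is moving every `sglang.srt` import from module scope into the methods that need it, so a client-only install (without torch and the other server dependencies) can still import `sglang.lang` and talk to a remote endpoint. The pattern in isolation (a generic example, not sglang code):

```python
# Generic deferred-import pattern: the heavy dependency is only required
# when this code path actually runs, not when the module is imported.
def tokenize_with_hf(text: str):
    from transformers import AutoTokenizer  # deferred, server-side dependency

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    return tokenizer(text)["input_ids"]
```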
python/sglang/srt/managers/scheduler.py

...
@@ -785,8 +785,9 @@ class Scheduler:
                 f"gen throughput (token/s): {gen_throughput:.2f}, "
                 f"#queue-req: {len(self.waiting_queue)}"
             )
+            spec_accept_length = 0
         else:
-            accept_length = (
+            spec_accept_length = (
                 self.spec_num_total_accepted_tokens / self.spec_num_total_forward_ct
             )
             self.spec_num_total_accepted_tokens = self.spec_num_total_forward_ct = 0
...
@@ -795,7 +796,7 @@ class Scheduler:
                 f"#running-req: {num_running_reqs}, "
                 f"#token: {num_used}, "
                 f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
-                f"accept len: {accept_length:.2f}, "
+                f"accept len: {spec_accept_length:.2f}, "
                 f"gen throughput (token/s): {gen_throughput:.2f}, "
                 f"#queue-req: {len(self.waiting_queue)}"
             )
...
@@ -807,6 +808,7 @@ class Scheduler:
         self.stats.token_usage = num_used / self.max_total_num_tokens
         self.stats.gen_throughput = gen_throughput
         self.stats.num_queue_reqs = len(self.waiting_queue)
+        self.stats.spec_accept_length = spec_accept_length
         self.metrics_collector.log_stats(self.stats)

     def check_memory(self):
...
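The rename from `accept_length` to `spec_accept_length` makes the speculative-decoding metric explicit and lets it flow into `SchedulerStats`. The value is the running mean of accepted tokens per speculative forward pass, with the counters reset after each log interval; in isolation:

```python
# Illustrative numbers: how the logged value is derived each decode-log interval.
spec_num_total_accepted_tokens = 96  # tokens accepted since the last log line
spec_num_total_forward_ct = 30       # speculative forward passes since the last log line

spec_accept_length = spec_num_total_accepted_tokens / spec_num_total_forward_ct
print(f"accept len: {spec_accept_length:.2f}")  # -> accept len: 3.20

# Reset so the next interval is measured independently.
spec_num_total_accepted_tokens = spec_num_total_forward_ct = 0
```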
python/sglang/srt/metrics/collector.py

...
@@ -25,6 +25,7 @@ class SchedulerStats:
     gen_throughput: float = 0.0
     num_queue_reqs: int = 0
     cache_hit_rate: float = 0.0
+    spec_accept_length: float = 0.0


 class SchedulerMetricsCollector:
...
@@ -37,42 +38,49 @@ class SchedulerMetricsCollector:
         self.num_running_reqs = Gauge(
             name="sglang:num_running_reqs",
-            documentation="The number of running requests",
+            documentation="The number of running requests.",
             labelnames=labels.keys(),
             multiprocess_mode="sum",
         )

         self.num_used_tokens = Gauge(
             name="sglang:num_used_tokens",
-            documentation="The number of used tokens",
+            documentation="The number of used tokens.",
             labelnames=labels.keys(),
             multiprocess_mode="sum",
         )

         self.token_usage = Gauge(
             name="sglang:token_usage",
-            documentation="The token usage",
+            documentation="The token usage.",
             labelnames=labels.keys(),
             multiprocess_mode="mostrecent",
         )

         self.gen_throughput = Gauge(
             name="sglang:gen_throughput",
-            documentation="The generate throughput (token/s)",
+            documentation="The generation throughput (token/s).",
             labelnames=labels.keys(),
             multiprocess_mode="sum",
         )

         self.num_queue_reqs = Gauge(
             name="sglang:num_queue_reqs",
-            documentation="The number of requests in the waiting queue",
+            documentation="The number of requests in the waiting queue.",
             labelnames=labels.keys(),
             multiprocess_mode="sum",
         )

         self.cache_hit_rate = Gauge(
             name="sglang:cache_hit_rate",
-            documentation="The cache hit rate",
+            documentation="The prefix cache hit rate.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+
+        self.spec_accept_length = Gauge(
+            name="sglang:spec_accept_length",
+            documentation="The average acceptance length of speculative decoding.",
             labelnames=labels.keys(),
             multiprocess_mode="mostrecent",
         )
...
@@ -88,6 +96,7 @@ class SchedulerMetricsCollector:
         self._log_gauge(self.gen_throughput, stats.gen_throughput)
         self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
         self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
+        self._log_gauge(self.spec_accept_length, stats.spec_accept_length)


 class TokenizerMetricsCollector:
...
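The new gauge follows the same `prometheus_client` pattern as the existing ones; `multiprocess_mode="mostrecent"` makes the multiprocess exporter keep the latest sample rather than summing across scheduler processes. A standalone sketch of registering and updating such a gauge (the label set here is an assumption, not taken from this commit):

```python
from prometheus_client import Gauge

# Register the gauge once, keyed by the same labels the collector uses.
spec_accept_length = Gauge(
    name="sglang:spec_accept_length",
    documentation="The average acceptance length of speculative decoding.",
    labelnames=["model_name"],  # assumed label set for illustration
    multiprocess_mode="mostrecent",
)

# What `_log_gauge` effectively does on every log_stats() call:
spec_accept_length.labels(model_name="meta-llama/Llama-3.1-8B-Instruct").set(3.2)
```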
sgl-router/py_src/sglang_router/__init__.py

 # a lightweihgt wrapper on router with argument type and comments
-from sglang_router_rs import PolicyType
-
 # no wrapper on policy type => direct export
-from .router import Router
-
-__all__ = ["Router", "PolicyType"]
-
+from sglang_router.router import Router
 from sglang_router.version import __version__
-
-__all__ += ["__version__"]
+from sglang_router_rs import PolicyType

+__all__ = ["Router", "PolicyType", "__version__"]
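Switching to the absolute `sglang_router.router` import removes relative-import ambiguity, and collapsing the incremental `__all__ +=` updates into a single literal makes the public surface readable at a glance. `__all__` governs star-imports, so after this change:

```python
# Illustrative: with __all__ = ["Router", "PolicyType", "__version__"],
# a star-import binds exactly the three declared names and nothing else.
from sglang_router import *

print(Router, PolicyType, __version__)
```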
test/srt/run_suite.py

...
@@ -42,8 +42,7 @@ suites = {
         "test_srt_endpoint.py",
         "test_torch_compile.py",
         "test_torch_compile_moe.py",
-        # Temporarily disable this because it requires PyTorch >= 2.5
-        "test_torch_native_attention_backend.py",
+        # "test_torch_native_attention_backend.py",
         "test_torchao.py",
         "test_triton_attention_kernels.py",
         "test_triton_attention_backend.py",
...