Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
887c2b45
Unverified
Commit
887c2b45
authored
Oct 13, 2025
by
Chang Su
Committed by
GitHub
Oct 13, 2025
Browse files
[router][grpc] Add `serve_grpc` to `launch_server` and log id for HealthCheck (#11564)
parent
065ce815
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
68 additions
and
93 deletions
+68
-93
python/sglang/launch_server.py
python/sglang/launch_server.py
+9
-2
python/sglang/srt/entrypoints/grpc_server.py
python/sglang/srt/entrypoints/grpc_server.py
+5
-25
python/sglang/srt/grpc/grpc_request_manager.py
python/sglang/srt/grpc/grpc_request_manager.py
+0
-0
python/sglang/srt/grpc/sglang_scheduler.proto
python/sglang/srt/grpc/sglang_scheduler.proto
+1
-4
python/sglang/srt/grpc/sglang_scheduler_pb2.py
python/sglang/srt/grpc/sglang_scheduler_pb2.py
+38
-38
python/sglang/srt/grpc/sglang_scheduler_pb2.pyi
python/sglang/srt/grpc/sglang_scheduler_pb2.pyi
+2
-4
python/sglang/srt/server_args.py
python/sglang/srt/server_args.py
+6
-0
sgl-router/src/grpc_client/sglang_scheduler.rs
sgl-router/src/grpc_client/sglang_scheduler.rs
+6
-16
sgl-router/src/proto/sglang_scheduler.proto
sgl-router/src/proto/sglang_scheduler.proto
+1
-4
No files found.
python/sglang/launch_server.py
View file @
887c2b45
"""Launch the inference server."""
"""Launch the inference server."""
import
asyncio
import
os
import
os
import
sys
import
sys
from
sglang.srt.entrypoints.http_server
import
launch_server
from
sglang.srt.server_args
import
prepare_server_args
from
sglang.srt.server_args
import
prepare_server_args
from
sglang.srt.utils
import
kill_process_tree
from
sglang.srt.utils
import
kill_process_tree
...
@@ -11,6 +11,13 @@ if __name__ == "__main__":
...
@@ -11,6 +11,13 @@ if __name__ == "__main__":
server_args
=
prepare_server_args
(
sys
.
argv
[
1
:])
server_args
=
prepare_server_args
(
sys
.
argv
[
1
:])
try
:
try
:
launch_server
(
server_args
)
if
server_args
.
grpc_mode
:
from
sglang.srt.entrypoints.grpc_server
import
serve_grpc
asyncio
.
run
(
serve_grpc
(
server_args
))
else
:
from
sglang.srt.entrypoints.http_server
import
launch_server
launch_server
(
server_args
)
finally
:
finally
:
kill_process_tree
(
os
.
getpid
(),
include_parent
=
False
)
kill_process_tree
(
os
.
getpid
(),
include_parent
=
False
)
python/sglang/srt/entrypoints/grpc_server.py
View file @
887c2b45
...
@@ -22,8 +22,8 @@ from grpc_reflection.v1alpha import reflection
...
@@ -22,8 +22,8 @@ from grpc_reflection.v1alpha import reflection
import
sglang
import
sglang
from
sglang.srt.disaggregation.utils
import
FAKE_BOOTSTRAP_HOST
,
DisaggregationMode
from
sglang.srt.disaggregation.utils
import
FAKE_BOOTSTRAP_HOST
,
DisaggregationMode
from
sglang.srt.entrypoints.grpc_request_manager
import
GrpcRequestManager
from
sglang.srt.grpc
import
sglang_scheduler_pb2
,
sglang_scheduler_pb2_grpc
from
sglang.srt.grpc
import
sglang_scheduler_pb2
,
sglang_scheduler_pb2_grpc
from
sglang.srt.grpc.grpc_request_manager
import
GrpcRequestManager
from
sglang.srt.managers.data_parallel_controller
import
(
from
sglang.srt.managers.data_parallel_controller
import
(
run_data_parallel_controller_process
,
run_data_parallel_controller_process
,
)
)
...
@@ -68,6 +68,8 @@ def _launch_scheduler_process_only(
...
@@ -68,6 +68,8 @@ def _launch_scheduler_process_only(
# Configure global environment
# Configure global environment
configure_logger
(
server_args
)
configure_logger
(
server_args
)
server_args
.
check_server_args
()
server_args
.
check_server_args
()
# Fix CUDA multiprocessing issues - must be called before any CUDA operations
mp
.
set_start_method
(
"spawn"
,
force
=
True
)
# Allocate ports for inter-process communications
# Allocate ports for inter-process communications
if
port_args
is
None
:
if
port_args
is
None
:
...
@@ -317,7 +319,8 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
...
@@ -317,7 +319,8 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
Check the health of the inference server by sending a special request to generate one token.
Check the health of the inference server by sending a special request to generate one token.
Similar to HTTP server's /health endpoint.
Similar to HTTP server's /health endpoint.
"""
"""
logger
.
info
(
"Receive health check request"
)
rid
=
f
"HEALTH_CHECK_
{
time
.
time
()
}
"
logger
.
info
(
f
"Receive health check request:
{
rid
}
"
)
if
self
.
request_manager
.
gracefully_exit
:
if
self
.
request_manager
.
gracefully_exit
:
logger
.
info
(
logger
.
info
(
...
@@ -328,7 +331,6 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
...
@@ -328,7 +331,6 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
)
)
# Create a special health check request
# Create a special health check request
rid
=
f
"HEALTH_CHECK_
{
time
.
time
()
}
"
sampling_params
=
SGLSamplingParams
(
max_new_tokens
=
1
,
temperature
=
0.0
)
sampling_params
=
SGLSamplingParams
(
max_new_tokens
=
1
,
temperature
=
0.0
)
sampling_params
.
normalize
(
tokenizer
=
None
)
sampling_params
.
normalize
(
tokenizer
=
None
)
...
@@ -919,25 +921,3 @@ async def serve_grpc(
...
@@ -919,25 +921,3 @@ async def serve_grpc(
proc
.
join
(
timeout
=
1.0
)
proc
.
join
(
timeout
=
1.0
)
logger
.
info
(
"All scheduler processes terminated"
)
logger
.
info
(
"All scheduler processes terminated"
)
def
main
():
"""Main entry point for standalone gRPC server."""
# Fix CUDA multiprocessing issues - must be called before any CUDA operations
mp
.
set_start_method
(
"spawn"
,
force
=
True
)
parser
=
argparse
.
ArgumentParser
(
description
=
"SGLang Standalone gRPC Server"
)
ServerArgs
.
add_cli_args
(
parser
)
args
=
parser
.
parse_args
()
server_args
=
ServerArgs
.
from_cli_args
(
args
)
# Run server
asyncio
.
run
(
serve_grpc
(
server_args
=
server_args
,
)
)
if
__name__
==
"__main__"
:
main
()
python/sglang/srt/
entrypoints
/grpc_request_manager.py
→
python/sglang/srt/
grpc
/grpc_request_manager.py
View file @
887c2b45
File moved
python/sglang/srt/grpc/sglang_scheduler.proto
View file @
887c2b45
...
@@ -326,10 +326,7 @@ message EmbedError {
...
@@ -326,10 +326,7 @@ message EmbedError {
// Management Operations
// Management Operations
// =====================
// =====================
message
HealthCheckRequest
{
message
HealthCheckRequest
{}
// Input for health test generation (must be tokenized)
TokenizedInput
tokenized
=
1
;
}
message
HealthCheckResponse
{
message
HealthCheckResponse
{
bool
healthy
=
1
;
bool
healthy
=
1
;
...
...
python/sglang/srt/grpc/sglang_scheduler_pb2.py
View file @
887c2b45
...
@@ -29,7 +29,7 @@ from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__
...
@@ -29,7 +29,7 @@ from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__
from
google.protobuf
import
struct_pb2
as
google_dot_protobuf_dot_struct__pb2
from
google.protobuf
import
struct_pb2
as
google_dot_protobuf_dot_struct__pb2
DESCRIPTOR
=
_descriptor_pool
.
Default
().
AddSerializedFile
(
b
'
\n\x16
sglang_scheduler.proto
\x12\x15
sglang.grpc.scheduler
\x1a\x1f
google/protobuf/timestamp.proto
\x1a\x1c
google/protobuf/struct.proto
\"\xd0\x05\n\x0e
SamplingParams
\x12\x13\n\x0b
temperature
\x18\x01
\x01
(
\x02\x12\r\n\x05
top_p
\x18\x02
\x01
(
\x02\x12\r\n\x05
top_k
\x18\x03
\x01
(
\x05\x12\r\n\x05
min_p
\x18\x04
\x01
(
\x02\x12\x19\n\x11\x66
requency_penalty
\x18\x05
\x01
(
\x02\x12\x18\n\x10
presence_penalty
\x18\x06
\x01
(
\x02\x12\x1a\n\x12
repetition_penalty
\x18\x07
\x01
(
\x02\x12\x1b\n\x0e
max_new_tokens
\x18\x08
\x01
(
\x05
H
\x01\x88\x01\x01\x12\x0c\n\x04
stop
\x18\t
\x03
(
\t\x12\x16\n\x0e
stop_token_ids
\x18\n
\x03
(
\r\x12\x1b\n\x13
skip_special_tokens
\x18\x0b
\x01
(
\x08\x12
%
\n\x1d
spaces_between_special_tokens
\x18\x0c
\x01
(
\x08\x12\x0f\n\x05
regex
\x18\r
\x01
(
\t
H
\x00\x12\x15\n\x0b
json_schema
\x18\x0e
\x01
(
\t
H
\x00\x12\x16\n\x0c\x65\x62
nf_grammar
\x18\x0f
\x01
(
\t
H
\x00\x12\x18\n\x0e
structural_tag
\x18\x10
\x01
(
\t
H
\x00\x12\t\n\x01
n
\x18\x11
\x01
(
\x05\x12\x16\n\x0e
min_new_tokens
\x18\x12
\x01
(
\x05\x12\x12\n\n
ignore_eos
\x18\x13
\x01
(
\x08\x12\x14\n\x0c
no_stop_trim
\x18\x14
\x01
(
\x08\x12\x1c\n\x0f
stream_interval
\x18\x15
\x01
(
\x05
H
\x02\x88\x01\x01\x12
H
\n\n
logit_bias
\x18\x16
\x03
(
\x0b\x32\x34
.sglang.grpc.scheduler.SamplingParams.LogitBiasEntry
\x12
.
\n\r
custom_params
\x18\x17
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x1a\x30\n\x0e
LogitBiasEntry
\x12\x0b\n\x03
key
\x18\x01
\x01
(
\t\x12\r\n\x05
value
\x18\x02
\x01
(
\x02
:
\x02\x38\x01\x42\x0c\n\n
constraintB
\x11\n\x0f
_max_new_tokensB
\x12\n\x10
_stream_interval
\"
]
\n\x13\x44
isaggregatedParams
\x12\x16\n\x0e\x62
ootstrap_host
\x18\x01
\x01
(
\t\x12\x16\n\x0e\x62
ootstrap_port
\x18\x02
\x01
(
\x05\x12\x16\n\x0e\x62
ootstrap_room
\x18\x03
\x01
(
\x05\"\xe2\x04\n\x0f
GenerateRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\t
tokenized
\x18\x02
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\x12
:
\n\t
mm_inputs
\x18\x03
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.MultimodalInputs
\x12
>
\n\x0f
sampling_params
\x18\x04
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.SamplingParams
\x12\x16\n\x0e
return_logprob
\x18\x05
\x01
(
\x08\x12\x19\n\x11
logprob_start_len
\x18\x06
\x01
(
\x05\x12\x18\n\x10
top_logprobs_num
\x18\x07
\x01
(
\x05\x12\x19\n\x11
token_ids_logprob
\x18\x08
\x03
(
\r\x12\x1c\n\x14
return_hidden_states
\x18\t
\x01
(
\x08\x12
H
\n\x14\x64
isaggregated_params
\x18\n
\x01
(
\x0b\x32
*.sglang.grpc.scheduler.DisaggregatedParams
\x12\x1e\n\x16\x63
ustom_logit_processor
\x18\x0b
\x01
(
\t\x12
-
\n\t
timestamp
\x18\x0c
\x01
(
\x0b\x32\x1a
.google.protobuf.Timestamp
\x12\x13\n\x0b
log_metrics
\x18\r
\x01
(
\x08\x12\x14\n\x0c
input_embeds
\x18\x0e
\x03
(
\x02\x12\x0f\n\x07
lora_id
\x18\x0f
\x01
(
\t\x12\x1a\n\x12\x64\x61
ta_parallel_rank
\x18\x10
\x01
(
\x05\x12\x0e\n\x06
stream
\x18\x11
\x01
(
\x08\"
:
\n\x0e
TokenizedInput
\x12\x15\n\r
original_text
\x18\x01
\x01
(
\t\x12\x11\n\t
input_ids
\x18\x02
\x03
(
\r\"\xd3\x01\n\x10
MultimodalInputs
\x12\x12\n\n
image_urls
\x18\x01
\x03
(
\t\x12\x12\n\n
video_urls
\x18\x02
\x03
(
\t\x12\x12\n\n
audio_urls
\x18\x03
\x03
(
\t\x12\x33\n\x12
processed_features
\x18\x04
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x12\x12\n\n
image_data
\x18\x05
\x03
(
\x0c\x12\x12\n\n
video_data
\x18\x06
\x03
(
\x0c\x12\x12\n\n
audio_data
\x18\x07
\x03
(
\x0c\x12\x12\n\n
modalities
\x18\x08
\x03
(
\t\"\xe3\x01\n\x10
GenerateResponse
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12
;
\n\x05\x63
hunk
\x18\x02
\x01
(
\x0b\x32
*.sglang.grpc.scheduler.GenerateStreamChunkH
\x00\x12
;
\n\x08\x63
omplete
\x18\x03
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.GenerateCompleteH
\x00\x12\x35\n\x05\x65
rror
\x18\x04
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.GenerateErrorH
\x00\x42\n\n\x08
response
\"\x95\x02\n\x13
GenerateStreamChunk
\x12\x11\n\t
token_ids
\x18\x01
\x03
(
\r\x12\x15\n\r
prompt_tokens
\x18\x02
\x01
(
\x05\x12\x19\n\x11\x63
ompletion_tokens
\x18\x03
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x04
\x01
(
\x05\x12
>
\n\x0f
output_logprobs
\x18\x05
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.OutputLogProbs
\x12\x15\n\r
hidden_states
\x18\x06
\x03
(
\x02\x12
<
\n\x0e
input_logprobs
\x18\x07
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.InputLogProbs
\x12\r\n\x05
index
\x18\x08
\x01
(
\r\"\x9b\x03\n\x10
GenerateComplete
\x12\x12\n\n
output_ids
\x18\x01
\x03
(
\r\x12\x15\n\r
finish_reason
\x18\x02
\x01
(
\t\x12\x15\n\r
prompt_tokens
\x18\x03
\x01
(
\x05\x12\x19\n\x11\x63
ompletion_tokens
\x18\x04
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x05
\x01
(
\x05\x12
>
\n\x0f
output_logprobs
\x18\x06
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.OutputLogProbs
\x12
>
\n\x11\x61
ll_hidden_states
\x18\x07
\x03
(
\x0b\x32
#.sglang.grpc.scheduler.HiddenStates
\x12\x1a\n\x10
matched_token_id
\x18\x08
\x01
(
\r
H
\x00\x12\x1a\n\x10
matched_stop_str
\x18\t
\x01
(
\t
H
\x00\x12
<
\n\x0e
input_logprobs
\x18\n
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.InputLogProbs
\x12\r\n\x05
index
\x18\x0b
\x01
(
\r
B
\x0e\n\x0c
matched_stop
\"
K
\n\r
GenerateError
\x12\x0f\n\x07
message
\x18\x01
\x01
(
\t\x12\x18\n\x10
http_status_code
\x18\x02
\x01
(
\t\x12\x0f\n\x07\x64\x65
tails
\x18\x03
\x01
(
\t\"
u
\n\x0e
OutputLogProbs
\x12\x16\n\x0e
token_logprobs
\x18\x01
\x03
(
\x02\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\x12\x38\n\x0c
top_logprobs
\x18\x03
\x03
(
\x0b\x32\"
.sglang.grpc.scheduler.TopLogProbs
\"\x9e\x01\n\r
InputLogProbs
\x12
@
\n\x0e
token_logprobs
\x18\x01
\x03
(
\x0b\x32
(.sglang.grpc.scheduler.InputTokenLogProb
\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\x12\x38\n\x0c
top_logprobs
\x18\x03
\x03
(
\x0b\x32\"
.sglang.grpc.scheduler.TopLogProbs
\"
1
\n\x11
InputTokenLogProb
\x12\x12\n\x05
value
\x18\x01
\x01
(
\x02
H
\x00\x88\x01\x01\x42\x08\n\x06
_value
\"
0
\n\x0b
TopLogProbs
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\"
?
\n\x0c
HiddenStates
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\r\n\x05
layer
\x18\x02
\x01
(
\x05\x12\x10\n\x08
position
\x18\x03
\x01
(
\x05\"\xca\x02\n\x0c\x45
mbedRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\t
tokenized
\x18\x02
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\x12
:
\n\t
mm_inputs
\x18\x04
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.MultimodalInputs
\x12
>
\n\x0f
sampling_params
\x18\x05
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.SamplingParams
\x12\x13\n\x0b
log_metrics
\x18\x06
\x01
(
\x08\x12\x16\n\x0e
token_type_ids
\x18\x07
\x03
(
\x05\x12\x1a\n\x12\x64\x61
ta_parallel_rank
\x18\x08
\x01
(
\x05\x12\x18\n\x10
is_cross_encoder
\x18\t
\x01
(
\x08\x12\r\n\x05
texts
\x18\n
\x03
(
\t\"\x9d\x01\n\r
EmbedResponse
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\x08\x63
omplete
\x18\x02
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.EmbedCompleteH
\x00\x12\x32\n\x05\x65
rror
\x18\x03
\x01
(
\x0b\x32
!.sglang.grpc.scheduler.EmbedErrorH
\x00\x42\n\n\x08
response
\"\xa3\x01\n\r
EmbedComplete
\x12\x11\n\t
embedding
\x18\x01
\x03
(
\x02\x12\x15\n\r
prompt_tokens
\x18\x02
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x03
\x01
(
\x05\x12\x15\n\r
embedding_dim
\x18\x04
\x01
(
\x05\x12
:
\n\x10\x62\x61
tch_embeddings
\x18\x05
\x03
(
\x0b\x32
.sglang.grpc.scheduler.Embedding
\"
*
\n\t
Embedding
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\r\n\x05
index
\x18\x02
\x01
(
\x05\"
<
\n\n
EmbedError
\x12\x0f\n\x07
message
\x18\x01
\x01
(
\t\x12\x0c\n\x04\x63
ode
\x18\x02
\x01
(
\t\x12\x0f\n\x07\x64\x65
tails
\x18\x03
\x01
(
\t\"
N
\n\x12
HealthCheckRequest
\
x12\x38\n\t
tokenized
\x18\x01
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\
"
7
\n\x13
HealthCheckResponse
\x12\x0f\n\x07
healthy
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
2
\n\x0c\x41\x62
ortRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x0e\n\x06
reason
\x18\x02
\x01
(
\t\"
1
\n\r
AbortResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
I
\n\x0f
LoadLoRARequest
\x12\x12\n\n
adapter_id
\x18\x01
\x01
(
\t\x12\x14\n\x0c\x61\x64\x61
pter_path
\x18\x02
\x01
(
\t\x12\x0c\n\x04
rank
\x18\x03
\x01
(
\x05\"
H
\n\x10
LoadLoRAResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x12\n\n
adapter_id
\x18\x02
\x01
(
\t\x12\x0f\n\x07
message
\x18\x03
\x01
(
\t\"\'\n\x11
UnloadLoRARequest
\x12\x12\n\n
adapter_id
\x18\x01
\x01
(
\t\"
6
\n\x12
UnloadLoRAResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
w
\n\x14
UpdateWeightsRequest
\x12\x13\n\t
disk_path
\x18\x01
\x01
(
\t
H
\x00\x12\x15\n\x0b
tensor_data
\x18\x02
\x01
(
\x0c
H
\x00\x12\x14\n\n
remote_url
\x18\x03
\x01
(
\t
H
\x00\x12\x13\n\x0b
weight_name
\x18\x04
\x01
(
\t
B
\x08\n\x06
source
\"
9
\n\x15
UpdateWeightsResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
-
\n\x17
GetInternalStateRequest
\x12\x12\n\n
state_keys
\x18\x01
\x03
(
\t\"
B
\n\x18
GetInternalStateResponse
\x12
&
\n\x05
state
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\"
A
\n\x17
SetInternalStateRequest
\x12
&
\n\x05
state
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\"
<
\n\x18
SetInternalStateResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"\x15\n\x13
GetModelInfoRequest
\"\xea\x02\n\x14
GetModelInfoResponse
\x12\x12\n\n
model_path
\x18\x01
\x01
(
\t\x12\x16\n\x0e
tokenizer_path
\x18\x02
\x01
(
\t\x12\x15\n\r
is_generation
\x18\x03
\x01
(
\x08\x12
!
\n\x19
preferred_sampling_params
\x18\x04
\x01
(
\t\x12\x16\n\x0e
weight_version
\x18\x05
\x01
(
\t\x12\x19\n\x11
served_model_name
\x18\x06
\x01
(
\t\x12\x1a\n\x12
max_context_length
\x18\x07
\x01
(
\x05\x12\x12\n\n
vocab_size
\x18\x08
\x01
(
\x05\x12\x17\n\x0f
supports_vision
\x18\t
\x01
(
\x08\x12\x12\n\n
model_type
\x18\n
\x01
(
\t\x12\x15\n\r
eos_token_ids
\x18\x0b
\x03
(
\x05\x12\x14\n\x0c
pad_token_id
\x18\x0c
\x01
(
\x05\x12\x14\n\x0c\x62
os_token_id
\x18\r
\x01
(
\x05\x12\x19\n\x11
max_req_input_len
\x18\x0e
\x01
(
\x05\"\x16\n\x14
GetServerInfoRequest
\"\xb7\x02\n\x15
GetServerInfoResponse
\x12
,
\n\x0b
server_args
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x12
/
\n\x0e
scheduler_info
\x18\x02
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x12\x17\n\x0f\x61\x63
tive_requests
\x18\x03
\x01
(
\x05\x12\x11\n\t
is_paused
\x18\x04
\x01
(
\x08\x12\x1e\n\x16
last_receive_timestamp
\x18\x05
\x01
(
\x01\x12\x16\n\x0e
uptime_seconds
\x18\x06
\x01
(
\x01\x12\x16\n\x0e
sglang_version
\x18\x07
\x01
(
\t\x12\x13\n\x0b
server_type
\x18\x08
\x01
(
\t\x12
.
\n\n
start_time
\x18\t
\x01
(
\x0b\x32\x1a
.google.protobuf.Timestamp2
\xd3\x04\n\x0f
SglangScheduler
\x12
]
\n\x08
Generate
\x12
&.sglang.grpc.scheduler.GenerateRequest
\x1a\'
.sglang.grpc.scheduler.GenerateResponse0
\x01\x12
R
\n\x05\x45
mbed
\x12
#.sglang.grpc.scheduler.EmbedRequest
\x1a
$.sglang.grpc.scheduler.EmbedResponse
\x12\x64\n\x0b
HealthCheck
\x12
).sglang.grpc.scheduler.HealthCheckRequest
\x1a
*.sglang.grpc.scheduler.HealthCheckResponse
\x12
R
\n\x05\x41\x62
ort
\x12
#.sglang.grpc.scheduler.AbortRequest
\x1a
$.sglang.grpc.scheduler.AbortResponse
\x12
g
\n\x0c
GetModelInfo
\x12
*.sglang.grpc.scheduler.GetModelInfoRequest
\x1a
+.sglang.grpc.scheduler.GetModelInfoResponse
\x12
j
\n\r
GetServerInfo
\x12
+.sglang.grpc.scheduler.GetServerInfoRequest
\x1a
,.sglang.grpc.scheduler.GetServerInfoResponseb
\x06
proto3'
)
DESCRIPTOR
=
_descriptor_pool
.
Default
().
AddSerializedFile
(
b
'
\n\x16
sglang_scheduler.proto
\x12\x15
sglang.grpc.scheduler
\x1a\x1f
google/protobuf/timestamp.proto
\x1a\x1c
google/protobuf/struct.proto
\"\xd0\x05\n\x0e
SamplingParams
\x12\x13\n\x0b
temperature
\x18\x01
\x01
(
\x02\x12\r\n\x05
top_p
\x18\x02
\x01
(
\x02\x12\r\n\x05
top_k
\x18\x03
\x01
(
\x05\x12\r\n\x05
min_p
\x18\x04
\x01
(
\x02\x12\x19\n\x11\x66
requency_penalty
\x18\x05
\x01
(
\x02\x12\x18\n\x10
presence_penalty
\x18\x06
\x01
(
\x02\x12\x1a\n\x12
repetition_penalty
\x18\x07
\x01
(
\x02\x12\x1b\n\x0e
max_new_tokens
\x18\x08
\x01
(
\x05
H
\x01\x88\x01\x01\x12\x0c\n\x04
stop
\x18\t
\x03
(
\t\x12\x16\n\x0e
stop_token_ids
\x18\n
\x03
(
\r\x12\x1b\n\x13
skip_special_tokens
\x18\x0b
\x01
(
\x08\x12
%
\n\x1d
spaces_between_special_tokens
\x18\x0c
\x01
(
\x08\x12\x0f\n\x05
regex
\x18\r
\x01
(
\t
H
\x00\x12\x15\n\x0b
json_schema
\x18\x0e
\x01
(
\t
H
\x00\x12\x16\n\x0c\x65\x62
nf_grammar
\x18\x0f
\x01
(
\t
H
\x00\x12\x18\n\x0e
structural_tag
\x18\x10
\x01
(
\t
H
\x00\x12\t\n\x01
n
\x18\x11
\x01
(
\x05\x12\x16\n\x0e
min_new_tokens
\x18\x12
\x01
(
\x05\x12\x12\n\n
ignore_eos
\x18\x13
\x01
(
\x08\x12\x14\n\x0c
no_stop_trim
\x18\x14
\x01
(
\x08\x12\x1c\n\x0f
stream_interval
\x18\x15
\x01
(
\x05
H
\x02\x88\x01\x01\x12
H
\n\n
logit_bias
\x18\x16
\x03
(
\x0b\x32\x34
.sglang.grpc.scheduler.SamplingParams.LogitBiasEntry
\x12
.
\n\r
custom_params
\x18\x17
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x1a\x30\n\x0e
LogitBiasEntry
\x12\x0b\n\x03
key
\x18\x01
\x01
(
\t\x12\r\n\x05
value
\x18\x02
\x01
(
\x02
:
\x02\x38\x01\x42\x0c\n\n
constraintB
\x11\n\x0f
_max_new_tokensB
\x12\n\x10
_stream_interval
\"
]
\n\x13\x44
isaggregatedParams
\x12\x16\n\x0e\x62
ootstrap_host
\x18\x01
\x01
(
\t\x12\x16\n\x0e\x62
ootstrap_port
\x18\x02
\x01
(
\x05\x12\x16\n\x0e\x62
ootstrap_room
\x18\x03
\x01
(
\x05\"\xe2\x04\n\x0f
GenerateRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\t
tokenized
\x18\x02
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\x12
:
\n\t
mm_inputs
\x18\x03
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.MultimodalInputs
\x12
>
\n\x0f
sampling_params
\x18\x04
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.SamplingParams
\x12\x16\n\x0e
return_logprob
\x18\x05
\x01
(
\x08\x12\x19\n\x11
logprob_start_len
\x18\x06
\x01
(
\x05\x12\x18\n\x10
top_logprobs_num
\x18\x07
\x01
(
\x05\x12\x19\n\x11
token_ids_logprob
\x18\x08
\x03
(
\r\x12\x1c\n\x14
return_hidden_states
\x18\t
\x01
(
\x08\x12
H
\n\x14\x64
isaggregated_params
\x18\n
\x01
(
\x0b\x32
*.sglang.grpc.scheduler.DisaggregatedParams
\x12\x1e\n\x16\x63
ustom_logit_processor
\x18\x0b
\x01
(
\t\x12
-
\n\t
timestamp
\x18\x0c
\x01
(
\x0b\x32\x1a
.google.protobuf.Timestamp
\x12\x13\n\x0b
log_metrics
\x18\r
\x01
(
\x08\x12\x14\n\x0c
input_embeds
\x18\x0e
\x03
(
\x02\x12\x0f\n\x07
lora_id
\x18\x0f
\x01
(
\t\x12\x1a\n\x12\x64\x61
ta_parallel_rank
\x18\x10
\x01
(
\x05\x12\x0e\n\x06
stream
\x18\x11
\x01
(
\x08\"
:
\n\x0e
TokenizedInput
\x12\x15\n\r
original_text
\x18\x01
\x01
(
\t\x12\x11\n\t
input_ids
\x18\x02
\x03
(
\r\"\xd3\x01\n\x10
MultimodalInputs
\x12\x12\n\n
image_urls
\x18\x01
\x03
(
\t\x12\x12\n\n
video_urls
\x18\x02
\x03
(
\t\x12\x12\n\n
audio_urls
\x18\x03
\x03
(
\t\x12\x33\n\x12
processed_features
\x18\x04
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x12\x12\n\n
image_data
\x18\x05
\x03
(
\x0c\x12\x12\n\n
video_data
\x18\x06
\x03
(
\x0c\x12\x12\n\n
audio_data
\x18\x07
\x03
(
\x0c\x12\x12\n\n
modalities
\x18\x08
\x03
(
\t\"\xe3\x01\n\x10
GenerateResponse
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12
;
\n\x05\x63
hunk
\x18\x02
\x01
(
\x0b\x32
*.sglang.grpc.scheduler.GenerateStreamChunkH
\x00\x12
;
\n\x08\x63
omplete
\x18\x03
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.GenerateCompleteH
\x00\x12\x35\n\x05\x65
rror
\x18\x04
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.GenerateErrorH
\x00\x42\n\n\x08
response
\"\x95\x02\n\x13
GenerateStreamChunk
\x12\x11\n\t
token_ids
\x18\x01
\x03
(
\r\x12\x15\n\r
prompt_tokens
\x18\x02
\x01
(
\x05\x12\x19\n\x11\x63
ompletion_tokens
\x18\x03
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x04
\x01
(
\x05\x12
>
\n\x0f
output_logprobs
\x18\x05
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.OutputLogProbs
\x12\x15\n\r
hidden_states
\x18\x06
\x03
(
\x02\x12
<
\n\x0e
input_logprobs
\x18\x07
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.InputLogProbs
\x12\r\n\x05
index
\x18\x08
\x01
(
\r\"\x9b\x03\n\x10
GenerateComplete
\x12\x12\n\n
output_ids
\x18\x01
\x03
(
\r\x12\x15\n\r
finish_reason
\x18\x02
\x01
(
\t\x12\x15\n\r
prompt_tokens
\x18\x03
\x01
(
\x05\x12\x19\n\x11\x63
ompletion_tokens
\x18\x04
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x05
\x01
(
\x05\x12
>
\n\x0f
output_logprobs
\x18\x06
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.OutputLogProbs
\x12
>
\n\x11\x61
ll_hidden_states
\x18\x07
\x03
(
\x0b\x32
#.sglang.grpc.scheduler.HiddenStates
\x12\x1a\n\x10
matched_token_id
\x18\x08
\x01
(
\r
H
\x00\x12\x1a\n\x10
matched_stop_str
\x18\t
\x01
(
\t
H
\x00\x12
<
\n\x0e
input_logprobs
\x18\n
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.InputLogProbs
\x12\r\n\x05
index
\x18\x0b
\x01
(
\r
B
\x0e\n\x0c
matched_stop
\"
K
\n\r
GenerateError
\x12\x0f\n\x07
message
\x18\x01
\x01
(
\t\x12\x18\n\x10
http_status_code
\x18\x02
\x01
(
\t\x12\x0f\n\x07\x64\x65
tails
\x18\x03
\x01
(
\t\"
u
\n\x0e
OutputLogProbs
\x12\x16\n\x0e
token_logprobs
\x18\x01
\x03
(
\x02\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\x12\x38\n\x0c
top_logprobs
\x18\x03
\x03
(
\x0b\x32\"
.sglang.grpc.scheduler.TopLogProbs
\"\x9e\x01\n\r
InputLogProbs
\x12
@
\n\x0e
token_logprobs
\x18\x01
\x03
(
\x0b\x32
(.sglang.grpc.scheduler.InputTokenLogProb
\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\x12\x38\n\x0c
top_logprobs
\x18\x03
\x03
(
\x0b\x32\"
.sglang.grpc.scheduler.TopLogProbs
\"
1
\n\x11
InputTokenLogProb
\x12\x12\n\x05
value
\x18\x01
\x01
(
\x02
H
\x00\x88\x01\x01\x42\x08\n\x06
_value
\"
0
\n\x0b
TopLogProbs
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\"
?
\n\x0c
HiddenStates
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\r\n\x05
layer
\x18\x02
\x01
(
\x05\x12\x10\n\x08
position
\x18\x03
\x01
(
\x05\"\xca\x02\n\x0c\x45
mbedRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\t
tokenized
\x18\x02
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\x12
:
\n\t
mm_inputs
\x18\x04
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.MultimodalInputs
\x12
>
\n\x0f
sampling_params
\x18\x05
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.SamplingParams
\x12\x13\n\x0b
log_metrics
\x18\x06
\x01
(
\x08\x12\x16\n\x0e
token_type_ids
\x18\x07
\x03
(
\x05\x12\x1a\n\x12\x64\x61
ta_parallel_rank
\x18\x08
\x01
(
\x05\x12\x18\n\x10
is_cross_encoder
\x18\t
\x01
(
\x08\x12\r\n\x05
texts
\x18\n
\x03
(
\t\"\x9d\x01\n\r
EmbedResponse
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\x08\x63
omplete
\x18\x02
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.EmbedCompleteH
\x00\x12\x32\n\x05\x65
rror
\x18\x03
\x01
(
\x0b\x32
!.sglang.grpc.scheduler.EmbedErrorH
\x00\x42\n\n\x08
response
\"\xa3\x01\n\r
EmbedComplete
\x12\x11\n\t
embedding
\x18\x01
\x03
(
\x02\x12\x15\n\r
prompt_tokens
\x18\x02
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x03
\x01
(
\x05\x12\x15\n\r
embedding_dim
\x18\x04
\x01
(
\x05\x12
:
\n\x10\x62\x61
tch_embeddings
\x18\x05
\x03
(
\x0b\x32
.sglang.grpc.scheduler.Embedding
\"
*
\n\t
Embedding
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\r\n\x05
index
\x18\x02
\x01
(
\x05\"
<
\n\n
EmbedError
\x12\x0f\n\x07
message
\x18\x01
\x01
(
\t\x12\x0c\n\x04\x63
ode
\x18\x02
\x01
(
\t\x12\x0f\n\x07\x64\x65
tails
\x18\x03
\x01
(
\t\"
\x14
\n\x12
HealthCheckRequest
\"
7
\n\x13
HealthCheckResponse
\x12\x0f\n\x07
healthy
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
2
\n\x0c\x41\x62
ortRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x0e\n\x06
reason
\x18\x02
\x01
(
\t\"
1
\n\r
AbortResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
I
\n\x0f
LoadLoRARequest
\x12\x12\n\n
adapter_id
\x18\x01
\x01
(
\t\x12\x14\n\x0c\x61\x64\x61
pter_path
\x18\x02
\x01
(
\t\x12\x0c\n\x04
rank
\x18\x03
\x01
(
\x05\"
H
\n\x10
LoadLoRAResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x12\n\n
adapter_id
\x18\x02
\x01
(
\t\x12\x0f\n\x07
message
\x18\x03
\x01
(
\t\"\'\n\x11
UnloadLoRARequest
\x12\x12\n\n
adapter_id
\x18\x01
\x01
(
\t\"
6
\n\x12
UnloadLoRAResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
w
\n\x14
UpdateWeightsRequest
\x12\x13\n\t
disk_path
\x18\x01
\x01
(
\t
H
\x00\x12\x15\n\x0b
tensor_data
\x18\x02
\x01
(
\x0c
H
\x00\x12\x14\n\n
remote_url
\x18\x03
\x01
(
\t
H
\x00\x12\x13\n\x0b
weight_name
\x18\x04
\x01
(
\t
B
\x08\n\x06
source
\"
9
\n\x15
UpdateWeightsResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
-
\n\x17
GetInternalStateRequest
\x12\x12\n\n
state_keys
\x18\x01
\x03
(
\t\"
B
\n\x18
GetInternalStateResponse
\x12
&
\n\x05
state
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\"
A
\n\x17
SetInternalStateRequest
\x12
&
\n\x05
state
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\"
<
\n\x18
SetInternalStateResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"\x15\n\x13
GetModelInfoRequest
\"\xea\x02\n\x14
GetModelInfoResponse
\x12\x12\n\n
model_path
\x18\x01
\x01
(
\t\x12\x16\n\x0e
tokenizer_path
\x18\x02
\x01
(
\t\x12\x15\n\r
is_generation
\x18\x03
\x01
(
\x08\x12
!
\n\x19
preferred_sampling_params
\x18\x04
\x01
(
\t\x12\x16\n\x0e
weight_version
\x18\x05
\x01
(
\t\x12\x19\n\x11
served_model_name
\x18\x06
\x01
(
\t\x12\x1a\n\x12
max_context_length
\x18\x07
\x01
(
\x05\x12\x12\n\n
vocab_size
\x18\x08
\x01
(
\x05\x12\x17\n\x0f
supports_vision
\x18\t
\x01
(
\x08\x12\x12\n\n
model_type
\x18\n
\x01
(
\t\x12\x15\n\r
eos_token_ids
\x18\x0b
\x03
(
\x05\x12\x14\n\x0c
pad_token_id
\x18\x0c
\x01
(
\x05\x12\x14\n\x0c\x62
os_token_id
\x18\r
\x01
(
\x05\x12\x19\n\x11
max_req_input_len
\x18\x0e
\x01
(
\x05\"\x16\n\x14
GetServerInfoRequest
\"\xb7\x02\n\x15
GetServerInfoResponse
\x12
,
\n\x0b
server_args
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x12
/
\n\x0e
scheduler_info
\x18\x02
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x12\x17\n\x0f\x61\x63
tive_requests
\x18\x03
\x01
(
\x05\x12\x11\n\t
is_paused
\x18\x04
\x01
(
\x08\x12\x1e\n\x16
last_receive_timestamp
\x18\x05
\x01
(
\x01\x12\x16\n\x0e
uptime_seconds
\x18\x06
\x01
(
\x01\x12\x16\n\x0e
sglang_version
\x18\x07
\x01
(
\t\x12\x13\n\x0b
server_type
\x18\x08
\x01
(
\t\x12
.
\n\n
start_time
\x18\t
\x01
(
\x0b\x32\x1a
.google.protobuf.Timestamp2
\xd3\x04\n\x0f
SglangScheduler
\x12
]
\n\x08
Generate
\x12
&.sglang.grpc.scheduler.GenerateRequest
\x1a\'
.sglang.grpc.scheduler.GenerateResponse0
\x01\x12
R
\n\x05\x45
mbed
\x12
#.sglang.grpc.scheduler.EmbedRequest
\x1a
$.sglang.grpc.scheduler.EmbedResponse
\x12\x64\n\x0b
HealthCheck
\x12
).sglang.grpc.scheduler.HealthCheckRequest
\x1a
*.sglang.grpc.scheduler.HealthCheckResponse
\x12
R
\n\x05\x41\x62
ort
\x12
#.sglang.grpc.scheduler.AbortRequest
\x1a
$.sglang.grpc.scheduler.AbortResponse
\x12
g
\n\x0c
GetModelInfo
\x12
*.sglang.grpc.scheduler.GetModelInfoRequest
\x1a
+.sglang.grpc.scheduler.GetModelInfoResponse
\x12
j
\n\r
GetServerInfo
\x12
+.sglang.grpc.scheduler.GetServerInfoRequest
\x1a
,.sglang.grpc.scheduler.GetServerInfoResponseb
\x06
proto3'
)
_globals
=
globals
()
_globals
=
globals
()
_builder
.
BuildMessageAndEnumDescriptors
(
DESCRIPTOR
,
_globals
)
_builder
.
BuildMessageAndEnumDescriptors
(
DESCRIPTOR
,
_globals
)
...
@@ -79,41 +79,41 @@ if not _descriptor._USE_C_DESCRIPTORS:
...
@@ -79,41 +79,41 @@ if not _descriptor._USE_C_DESCRIPTORS:
_globals
[
'_EMBEDERROR'
].
_serialized_start
=
3967
_globals
[
'_EMBEDERROR'
].
_serialized_start
=
3967
_globals
[
'_EMBEDERROR'
].
_serialized_end
=
4027
_globals
[
'_EMBEDERROR'
].
_serialized_end
=
4027
_globals
[
'_HEALTHCHECKREQUEST'
].
_serialized_start
=
4029
_globals
[
'_HEALTHCHECKREQUEST'
].
_serialized_start
=
4029
_globals
[
'_HEALTHCHECKREQUEST'
].
_serialized_end
=
4
107
_globals
[
'_HEALTHCHECKREQUEST'
].
_serialized_end
=
4
049
_globals
[
'_HEALTHCHECKRESPONSE'
].
_serialized_start
=
4
109
_globals
[
'_HEALTHCHECKRESPONSE'
].
_serialized_start
=
4
051
_globals
[
'_HEALTHCHECKRESPONSE'
].
_serialized_end
=
416
4
_globals
[
'_HEALTHCHECKRESPONSE'
].
_serialized_end
=
41
0
6
_globals
[
'_ABORTREQUEST'
].
_serialized_start
=
41
66
_globals
[
'_ABORTREQUEST'
].
_serialized_start
=
41
08
_globals
[
'_ABORTREQUEST'
].
_serialized_end
=
4
216
_globals
[
'_ABORTREQUEST'
].
_serialized_end
=
4
158
_globals
[
'_ABORTRESPONSE'
].
_serialized_start
=
4
218
_globals
[
'_ABORTRESPONSE'
].
_serialized_start
=
4
160
_globals
[
'_ABORTRESPONSE'
].
_serialized_end
=
42
67
_globals
[
'_ABORTRESPONSE'
].
_serialized_end
=
42
09
_globals
[
'_LOADLORAREQUEST'
].
_serialized_start
=
42
69
_globals
[
'_LOADLORAREQUEST'
].
_serialized_start
=
42
11
_globals
[
'_LOADLORAREQUEST'
].
_serialized_end
=
4
342
_globals
[
'_LOADLORAREQUEST'
].
_serialized_end
=
4
284
_globals
[
'_LOADLORARESPONSE'
].
_serialized_start
=
4
344
_globals
[
'_LOADLORARESPONSE'
].
_serialized_start
=
4
286
_globals
[
'_LOADLORARESPONSE'
].
_serialized_end
=
4
416
_globals
[
'_LOADLORARESPONSE'
].
_serialized_end
=
4
358
_globals
[
'_UNLOADLORAREQUEST'
].
_serialized_start
=
4
418
_globals
[
'_UNLOADLORAREQUEST'
].
_serialized_start
=
4
360
_globals
[
'_UNLOADLORAREQUEST'
].
_serialized_end
=
4
457
_globals
[
'_UNLOADLORAREQUEST'
].
_serialized_end
=
4
399
_globals
[
'_UNLOADLORARESPONSE'
].
_serialized_start
=
44
59
_globals
[
'_UNLOADLORARESPONSE'
].
_serialized_start
=
44
01
_globals
[
'_UNLOADLORARESPONSE'
].
_serialized_end
=
4
513
_globals
[
'_UNLOADLORARESPONSE'
].
_serialized_end
=
4
455
_globals
[
'_UPDATEWEIGHTSREQUEST'
].
_serialized_start
=
4
515
_globals
[
'_UPDATEWEIGHTSREQUEST'
].
_serialized_start
=
4
457
_globals
[
'_UPDATEWEIGHTSREQUEST'
].
_serialized_end
=
4
634
_globals
[
'_UPDATEWEIGHTSREQUEST'
].
_serialized_end
=
4
576
_globals
[
'_UPDATEWEIGHTSRESPONSE'
].
_serialized_start
=
4
636
_globals
[
'_UPDATEWEIGHTSRESPONSE'
].
_serialized_start
=
4
578
_globals
[
'_UPDATEWEIGHTSRESPONSE'
].
_serialized_end
=
46
9
3
_globals
[
'_UPDATEWEIGHTSRESPONSE'
].
_serialized_end
=
463
5
_globals
[
'_GETINTERNALSTATEREQUEST'
].
_serialized_start
=
46
95
_globals
[
'_GETINTERNALSTATEREQUEST'
].
_serialized_start
=
46
37
_globals
[
'_GETINTERNALSTATEREQUEST'
].
_serialized_end
=
4
740
_globals
[
'_GETINTERNALSTATEREQUEST'
].
_serialized_end
=
4
682
_globals
[
'_GETINTERNALSTATERESPONSE'
].
_serialized_start
=
4
742
_globals
[
'_GETINTERNALSTATERESPONSE'
].
_serialized_start
=
4
684
_globals
[
'_GETINTERNALSTATERESPONSE'
].
_serialized_end
=
4
808
_globals
[
'_GETINTERNALSTATERESPONSE'
].
_serialized_end
=
4
750
_globals
[
'_SETINTERNALSTATEREQUEST'
].
_serialized_start
=
4
810
_globals
[
'_SETINTERNALSTATEREQUEST'
].
_serialized_start
=
4
752
_globals
[
'_SETINTERNALSTATEREQUEST'
].
_serialized_end
=
487
5
_globals
[
'_SETINTERNALSTATEREQUEST'
].
_serialized_end
=
48
1
7
_globals
[
'_SETINTERNALSTATERESPONSE'
].
_serialized_start
=
48
77
_globals
[
'_SETINTERNALSTATERESPONSE'
].
_serialized_start
=
48
19
_globals
[
'_SETINTERNALSTATERESPONSE'
].
_serialized_end
=
4
937
_globals
[
'_SETINTERNALSTATERESPONSE'
].
_serialized_end
=
4
879
_globals
[
'_GETMODELINFOREQUEST'
].
_serialized_start
=
4
939
_globals
[
'_GETMODELINFOREQUEST'
].
_serialized_start
=
4
881
_globals
[
'_GETMODELINFOREQUEST'
].
_serialized_end
=
49
6
0
_globals
[
'_GETMODELINFOREQUEST'
].
_serialized_end
=
490
2
_globals
[
'_GETMODELINFORESPONSE'
].
_serialized_start
=
49
63
_globals
[
'_GETMODELINFORESPONSE'
].
_serialized_start
=
49
05
_globals
[
'_GETMODELINFORESPONSE'
].
_serialized_end
=
5
325
_globals
[
'_GETMODELINFORESPONSE'
].
_serialized_end
=
5
267
_globals
[
'_GETSERVERINFOREQUEST'
].
_serialized_start
=
5
327
_globals
[
'_GETSERVERINFOREQUEST'
].
_serialized_start
=
5
269
_globals
[
'_GETSERVERINFOREQUEST'
].
_serialized_end
=
5
349
_globals
[
'_GETSERVERINFOREQUEST'
].
_serialized_end
=
5
291
_globals
[
'_GETSERVERINFORESPONSE'
].
_serialized_start
=
5
352
_globals
[
'_GETSERVERINFORESPONSE'
].
_serialized_start
=
5
294
_globals
[
'_GETSERVERINFORESPONSE'
].
_serialized_end
=
56
63
_globals
[
'_GETSERVERINFORESPONSE'
].
_serialized_end
=
56
05
_globals
[
'_SGLANGSCHEDULER'
].
_serialized_start
=
56
66
_globals
[
'_SGLANGSCHEDULER'
].
_serialized_start
=
56
08
_globals
[
'_SGLANGSCHEDULER'
].
_serialized_end
=
62
61
_globals
[
'_SGLANGSCHEDULER'
].
_serialized_end
=
62
03
# @@protoc_insertion_point(module_scope)
# @@protoc_insertion_point(module_scope)
python/sglang/srt/grpc/sglang_scheduler_pb2.pyi
View file @
887c2b45
...
@@ -320,10 +320,8 @@ class EmbedError(_message.Message):
...
@@ -320,10 +320,8 @@ class EmbedError(_message.Message):
def __init__(self, message: _Optional[str] = ..., code: _Optional[str] = ..., details: _Optional[str] = ...) -> None: ...
def __init__(self, message: _Optional[str] = ..., code: _Optional[str] = ..., details: _Optional[str] = ...) -> None: ...
class HealthCheckRequest(_message.Message):
class HealthCheckRequest(_message.Message):
__slots__ = ("tokenized",)
__slots__ = ()
TOKENIZED_FIELD_NUMBER: _ClassVar[int]
def __init__(self) -> None: ...
tokenized: TokenizedInput
def __init__(self, tokenized: _Optional[_Union[TokenizedInput, _Mapping]] = ...) -> None: ...
class HealthCheckResponse(_message.Message):
class HealthCheckResponse(_message.Message):
__slots__ = ("healthy", "message")
__slots__ = ("healthy", "message")
...
...
python/sglang/srt/server_args.py
View file @
887c2b45
...
@@ -194,6 +194,7 @@ class ServerArgs:
...
@@ -194,6 +194,7 @@ class ServerArgs:
# HTTP server
# HTTP server
host
:
str
=
"127.0.0.1"
host
:
str
=
"127.0.0.1"
port
:
int
=
30000
port
:
int
=
30000
grpc_mode
:
bool
=
False
skip_server_warmup
:
bool
=
False
skip_server_warmup
:
bool
=
False
warmups
:
Optional
[
str
]
=
None
warmups
:
Optional
[
str
]
=
None
nccl_port
:
Optional
[
int
]
=
None
nccl_port
:
Optional
[
int
]
=
None
...
@@ -1516,6 +1517,11 @@ class ServerArgs:
...
@@ -1516,6 +1517,11 @@ class ServerArgs:
default
=
ServerArgs
.
port
,
default
=
ServerArgs
.
port
,
help
=
"The port of the HTTP server."
,
help
=
"The port of the HTTP server."
,
)
)
parser
.
add_argument
(
"--grpc-mode"
,
action
=
"store_true"
,
help
=
"If set, use gRPC server instead of HTTP server."
,
)
parser
.
add_argument
(
parser
.
add_argument
(
"--skip-server-warmup"
,
"--skip-server-warmup"
,
action
=
"store_true"
,
action
=
"store_true"
,
...
...
sgl-router/src/grpc_client/sglang_scheduler.rs
View file @
887c2b45
...
@@ -169,8 +169,8 @@ impl SglangSchedulerClient {
...
@@ -169,8 +169,8 @@ impl SglangSchedulerClient {
&
self
,
&
self
,
)
->
Result
<
proto
::
HealthCheckResponse
,
Box
<
dyn
std
::
error
::
Error
+
Send
+
Sync
>>
{
)
->
Result
<
proto
::
HealthCheckResponse
,
Box
<
dyn
std
::
error
::
Error
+
Send
+
Sync
>>
{
debug!
(
"Sending health check request"
);
debug!
(
"Sending health check request"
);
//
Server ignores the request body and cre
ates its own health check internally
//
HealthCheckRequest is now empty - server gener
ates its own health check internally
let
request
=
Request
::
new
(
proto
::
HealthCheckRequest
{
tokenized
:
None
});
let
request
=
Request
::
new
(
proto
::
HealthCheckRequest
{});
let
mut
client
=
self
.client
.clone
();
let
mut
client
=
self
.client
.clone
();
let
response
=
client
.health_check
(
request
)
.await
?
;
let
response
=
client
.health_check
(
request
)
.await
?
;
...
@@ -510,13 +510,8 @@ mod tests {
...
@@ -510,13 +510,8 @@ mod tests {
#[test]
#[test]
fn
test_proto_types_compilation
()
{
fn
test_proto_types_compilation
()
{
let
health_req
=
proto
::
HealthCheckRequest
{
let
_
health_req
=
proto
::
HealthCheckRequest
{};
tokenized
:
Some
(
proto
::
TokenizedInput
{
// HealthCheckRequest is now empty - no fields to test
original_text
:
"test"
.to_string
(),
input_ids
:
vec!
[
1296
],
}),
};
assert
!
(
health_req
.tokenized
.is_some
());
}
}
#[test]
#[test]
...
@@ -558,13 +553,8 @@ mod tests {
...
@@ -558,13 +553,8 @@ mod tests {
#[test]
#[test]
fn
test_health_check_request
()
{
fn
test_health_check_request
()
{
let
health_req
=
proto
::
HealthCheckRequest
{
let
_
health_req
=
proto
::
HealthCheckRequest
{};
tokenized
:
Some
(
proto
::
TokenizedInput
{
// HealthCheckRequest is now empty - server generates its own test internally
original_text
:
"test"
.to_string
(),
input_ids
:
vec!
[
1296
],
// Mock token ID for "test"
}),
};
assert
!
(
health_req
.tokenized
.is_some
());
}
}
#[test]
#[test]
...
...
sgl-router/src/proto/sglang_scheduler.proto
View file @
887c2b45
...
@@ -326,10 +326,7 @@ message EmbedError {
...
@@ -326,10 +326,7 @@ message EmbedError {
// Management Operations
// Management Operations
// =====================
// =====================
message
HealthCheckRequest
{
message
HealthCheckRequest
{}
// Input for health test generation (must be tokenized)
TokenizedInput
tokenized
=
1
;
}
message
HealthCheckResponse
{
message
HealthCheckResponse
{
bool
healthy
=
1
;
bool
healthy
=
1
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment