Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
2fcd56ea
Unverified
Commit
2fcd56ea
authored
Oct 07, 2025
by
Simo Lin
Committed by
GitHub
Oct 07, 2025
Browse files
[router] add get server info and get model info in grpc server (#11303)
parent
0958a397
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
393 additions
and
3 deletions
+393
-3
python/sglang/srt/entrypoints/grpc_server.py
python/sglang/srt/entrypoints/grpc_server.py
+90
-0
python/sglang/srt/grpc/sglang_scheduler.proto
python/sglang/srt/grpc/sglang_scheduler.proto
+59
-0
python/sglang/srt/grpc/sglang_scheduler_pb2.py
python/sglang/srt/grpc/sglang_scheduler_pb2.py
+11
-3
python/sglang/srt/grpc/sglang_scheduler_pb2.pyi
python/sglang/srt/grpc/sglang_scheduler_pb2.pyi
+62
-0
python/sglang/srt/grpc/sglang_scheduler_pb2_grpc.py
python/sglang/srt/grpc/sglang_scheduler_pb2_grpc.py
+88
-0
sgl-router/src/grpc_client/sglang_scheduler.rs
sgl-router/src/grpc_client/sglang_scheduler.rs
+24
-0
sgl-router/src/proto/sglang_scheduler.proto
sgl-router/src/proto/sglang_scheduler.proto
+59
-0
No files found.
python/sglang/srt/entrypoints/grpc_server.py
View file @
2fcd56ea
...
...
@@ -5,6 +5,7 @@ Uses GrpcRequestManager for orchestration without tokenization.
import
argparse
import
asyncio
import
dataclasses
import
logging
import
multiprocessing
as
mp
import
os
...
...
@@ -15,8 +16,11 @@ from typing import AsyncIterator, Dict, Optional, Tuple
import
grpc
from
google.protobuf.json_format
import
MessageToDict
from
google.protobuf.struct_pb2
import
Struct
from
google.protobuf.timestamp_pb2
import
Timestamp
from
grpc_reflection.v1alpha
import
reflection
import
sglang
from
sglang.srt.disaggregation.utils
import
FAKE_BOOTSTRAP_HOST
,
DisaggregationMode
from
sglang.srt.entrypoints.grpc_request_manager
import
GrpcRequestManager
from
sglang.srt.grpc
import
sglang_scheduler_pb2
,
sglang_scheduler_pb2_grpc
...
...
@@ -173,11 +177,13 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
request_manager
:
GrpcRequestManager
,
server_args
:
ServerArgs
,
model_info
:
Dict
,
scheduler_info
:
Dict
,
):
"""Initialize the standalone gRPC service."""
self
.
request_manager
=
request_manager
self
.
server_args
=
server_args
self
.
model_info
=
model_info
self
.
scheduler_info
=
scheduler_info
self
.
start_time
=
time
.
time
()
# Start the request manager's event loop using auto_create_handle_loop
...
...
@@ -396,6 +402,89 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
message
=
str
(
e
),
)
async def GetModelInfo(
    self,
    request: sglang_scheduler_pb2.GetModelInfoRequest,
    context: grpc.aio.ServicerContext,
) -> sglang_scheduler_pb2.GetModelInfoResponse:
    """Build the ``GetModelInfoResponse`` describing the served model.

    Path/name fields come from ``self.server_args``; optional ones
    (``tokenizer_path``, ``preferred_sampling_params``, ``weight_version``)
    are replaced by ``""`` when falsy. ``is_generation`` is read from
    ``self.scheduler_info`` and, when absent there, derived as
    ``not server_args.is_embedding``. The remaining fields are looked up by
    key in ``self.model_info``; a missing key raises ``KeyError``.

    ``request`` and ``context`` are not used by this handler.
    """
    logger.info("Model info request received")

    generation_flag = self.scheduler_info.get("is_generation")
    args = self.server_args
    if generation_flag is None:
        # Scheduler did not report the flag: infer it from the server args.
        generation_flag = not args.is_embedding

    # Response fields copied verbatim from the model_info mapping; the
    # mapping keys and the proto field names are identical.
    model_info_keys = (
        "max_context_length",
        "vocab_size",
        "supports_vision",
        "model_type",
        "eos_token_ids",
        "pad_token_id",
        "bos_token_id",
        "max_req_input_len",
    )
    info = self.model_info

    return sglang_scheduler_pb2.GetModelInfoResponse(
        model_path=args.model_path,
        tokenizer_path=args.tokenizer_path or "",
        is_generation=generation_flag,
        preferred_sampling_params=(args.preferred_sampling_params or ""),
        weight_version=args.weight_version or "",
        served_model_name=args.served_model_name,
        **{key: info[key] for key in model_info_keys},
    )
async def GetServerInfo(
    self,
    request: sglang_scheduler_pb2.GetServerInfoRequest,
    context: grpc.aio.ServicerContext,
) -> sglang_scheduler_pb2.GetServerInfoResponse:
    """Build the ``GetServerInfoResponse`` for this gRPC server.

    The response carries:

    * ``server_args`` - ``dataclasses.asdict(self.server_args)`` as a Struct.
    * ``scheduler_info`` - ``self.scheduler_info`` as a Struct.
    * ``active_requests`` / ``is_paused`` / ``last_receive_timestamp`` -
      taken from ``self.request_manager.get_server_info()`` under the keys
      ``"active_requests"``, ``"paused"`` and ``"last_receive_time"``.
    * ``uptime_seconds`` - ``time.time() - self.start_time``.
    * ``start_time`` - ``self.start_time`` truncated to whole seconds.
    * ``sglang_version`` and the constant ``server_type="grpc"``.

    Both Struct payloads are passed through the same sanitiser before being
    stored, so a value that ``Struct`` cannot hold is stringified instead of
    raising inside the RPC.

    ``request`` and ``context`` are not used by this handler.
    """
    logger.info("Server info request received")

    def make_serializable(obj):
        """Recursively reduce ``obj`` to None/scalars/lists/str-keyed dicts."""
        if obj is None or isinstance(obj, (str, int, float, bool)):
            return obj
        if isinstance(obj, (list, tuple, set)):
            return [make_serializable(item) for item in obj]
        if isinstance(obj, dict):
            # Struct field names must be strings, so coerce the keys too.
            return {str(k): make_serializable(v) for k, v in obj.items()}
        # Anything else (enums, paths, custom objects, ...) is stringified.
        return str(obj)

    server_args_struct = Struct()
    server_args_struct.update(
        make_serializable(dataclasses.asdict(self.server_args))
    )

    # Sanitise scheduler_info exactly like server_args: it is an untyped
    # dict, so it gets the same protection against unsupported value types.
    scheduler_info_struct = Struct()
    scheduler_info_struct.update(make_serializable(self.scheduler_info))

    # Runtime state reported by the request manager.
    manager_state = self.request_manager.get_server_info()

    # Seconds elapsed since this servicer recorded its start time.
    uptime = time.time() - self.start_time

    # Timestamp has second granularity here: the fractional part is dropped.
    start_timestamp = Timestamp()
    start_timestamp.FromSeconds(int(self.start_time))

    return sglang_scheduler_pb2.GetServerInfoResponse(
        server_args=server_args_struct,
        scheduler_info=scheduler_info_struct,
        active_requests=manager_state["active_requests"],
        is_paused=manager_state["paused"],
        last_receive_timestamp=manager_state["last_receive_time"],
        uptime_seconds=uptime,
        sglang_version=sglang.__version__,
        server_type="grpc",
        start_time=start_timestamp,
    )
# Helper methods for request/response conversion
def
_convert_generate_request
(
...
...
@@ -756,6 +845,7 @@ async def serve_grpc(
request_manager
=
request_manager
,
server_args
=
server_args
,
model_info
=
model_info
,
scheduler_info
=
scheduler_info
,
)
sglang_scheduler_pb2_grpc
.
add_SglangSchedulerServicer_to_server
(
servicer
,
server
)
...
...
python/sglang/srt/grpc/sglang_scheduler.proto
View file @
2fcd56ea
...
...
@@ -20,6 +20,12 @@ service SglangScheduler {
// Abort a running request
rpc
Abort
(
AbortRequest
)
returns
(
AbortResponse
);
// Get model information
rpc
GetModelInfo
(
GetModelInfoRequest
)
returns
(
GetModelInfoResponse
);
// Get server information
rpc
GetServerInfo
(
GetServerInfoRequest
)
returns
(
GetServerInfoResponse
);
}
// =====================
...
...
@@ -401,3 +407,56 @@ message SetInternalStateResponse {
bool
success
=
1
;
string
message
=
2
;
}
// =====================
// Model and Server Info
// =====================
// Get model information
// Request for the GetModelInfo RPC. It has no fields.
message GetModelInfoRequest {}
// Response of the GetModelInfo RPC.
message GetModelInfoResponse {
  // Model identity and serving configuration.
  string model_path = 1;
  string tokenizer_path = 2;
  bool is_generation = 3;
  string preferred_sampling_params = 4;  // JSON string or empty
  string weight_version = 5;
  string served_model_name = 6;

  // Model characteristics.
  int32 max_context_length = 7;
  int32 vocab_size = 8;
  bool supports_vision = 9;
  string model_type = 10;

  // Special token ids.
  repeated int32 eos_token_ids = 11;
  int32 pad_token_id = 12;
  int32 bos_token_id = 13;

  // Request input length limit.
  int32 max_req_input_len = 14;
}
// Get server information
// Request for the GetServerInfo RPC. It has no fields.
message GetServerInfoRequest {}
// Response of the GetServerInfo RPC.
message GetServerInfoResponse {
  // Server configuration (as structured data)
  google.protobuf.Struct server_args = 1;

  // Scheduler metrics (from scheduler initialization)
  google.protobuf.Struct scheduler_info = 2;

  // Runtime state
  int32 active_requests = 3;
  bool is_paused = 4;
  double last_receive_timestamp = 5;
  double uptime_seconds = 6;

  // Version info
  string sglang_version = 7;

  // Server metadata
  string server_type = 8;  // "grpc"
  google.protobuf.Timestamp start_time = 9;

  // Note: internal_states not provided in gRPC mode
  // Scheduler-side metrics (memory usage, throughput) require
  // bidirectional communicator infrastructure not available in gRPC.
  // Use HTTP /get_server_info if scheduler internal state is needed.
}
python/sglang/srt/grpc/sglang_scheduler_pb2.py
View file @
2fcd56ea
...
...
@@ -29,7 +29,7 @@ from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__
from
google.protobuf
import
struct_pb2
as
google_dot_protobuf_dot_struct__pb2
DESCRIPTOR
=
_descriptor_pool
.
Default
().
AddSerializedFile
(
b
'
\n\x16
sglang_scheduler.proto
\x12\x15
sglang.grpc.scheduler
\x1a\x1f
google/protobuf/timestamp.proto
\x1a\x1c
google/protobuf/struct.proto
\"\xd0\x05\n\x0e
SamplingParams
\x12\x13\n\x0b
temperature
\x18\x01
\x01
(
\x02\x12\r\n\x05
top_p
\x18\x02
\x01
(
\x02\x12\r\n\x05
top_k
\x18\x03
\x01
(
\x05\x12\r\n\x05
min_p
\x18\x04
\x01
(
\x02\x12\x19\n\x11\x66
requency_penalty
\x18\x05
\x01
(
\x02\x12\x18\n\x10
presence_penalty
\x18\x06
\x01
(
\x02\x12\x1a\n\x12
repetition_penalty
\x18\x07
\x01
(
\x02\x12\x1b\n\x0e
max_new_tokens
\x18\x08
\x01
(
\x05
H
\x01\x88\x01\x01\x12\x0c\n\x04
stop
\x18\t
\x03
(
\t\x12\x16\n\x0e
stop_token_ids
\x18\n
\x03
(
\r\x12\x1b\n\x13
skip_special_tokens
\x18\x0b
\x01
(
\x08\x12
%
\n\x1d
spaces_between_special_tokens
\x18\x0c
\x01
(
\x08\x12\x0f\n\x05
regex
\x18\r
\x01
(
\t
H
\x00\x12\x15\n\x0b
json_schema
\x18\x0e
\x01
(
\t
H
\x00\x12\x16\n\x0c\x65\x62
nf_grammar
\x18\x0f
\x01
(
\t
H
\x00\x12\x18\n\x0e
structural_tag
\x18\x10
\x01
(
\t
H
\x00\x12\t\n\x01
n
\x18\x11
\x01
(
\x05\x12\x16\n\x0e
min_new_tokens
\x18\x12
\x01
(
\x05\x12\x12\n\n
ignore_eos
\x18\x13
\x01
(
\x08\x12\x14\n\x0c
no_stop_trim
\x18\x14
\x01
(
\x08\x12\x1c\n\x0f
stream_interval
\x18\x15
\x01
(
\x05
H
\x02\x88\x01\x01\x12
H
\n\n
logit_bias
\x18\x16
\x03
(
\x0b\x32\x34
.sglang.grpc.scheduler.SamplingParams.LogitBiasEntry
\x12
.
\n\r
custom_params
\x18\x17
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x1a\x30\n\x0e
LogitBiasEntry
\x12\x0b\n\x03
key
\x18\x01
\x01
(
\t\x12\r\n\x05
value
\x18\x02
\x01
(
\x02
:
\x02\x38\x01\x42\x0c\n\n
constraintB
\x11\n\x0f
_max_new_tokensB
\x12\n\x10
_stream_interval
\"
]
\n\x13\x44
isaggregatedParams
\x12\x16\n\x0e\x62
ootstrap_host
\x18\x01
\x01
(
\t\x12\x16\n\x0e\x62
ootstrap_port
\x18\x02
\x01
(
\x05\x12\x16\n\x0e\x62
ootstrap_room
\x18\x03
\x01
(
\x05\"\xe2\x04\n\x0f
GenerateRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\t
tokenized
\x18\x02
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\x12
:
\n\t
mm_inputs
\x18\x03
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.MultimodalInputs
\x12
>
\n\x0f
sampling_params
\x18\x04
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.SamplingParams
\x12\x16\n\x0e
return_logprob
\x18\x05
\x01
(
\x08\x12\x19\n\x11
logprob_start_len
\x18\x06
\x01
(
\x05\x12\x18\n\x10
top_logprobs_num
\x18\x07
\x01
(
\x05\x12\x19\n\x11
token_ids_logprob
\x18\x08
\x03
(
\r\x12\x1c\n\x14
return_hidden_states
\x18\t
\x01
(
\x08\x12
H
\n\x14\x64
isaggregated_params
\x18\n
\x01
(
\x0b\x32
*.sglang.grpc.scheduler.DisaggregatedParams
\x12\x1e\n\x16\x63
ustom_logit_processor
\x18\x0b
\x01
(
\t\x12
-
\n\t
timestamp
\x18\x0c
\x01
(
\x0b\x32\x1a
.google.protobuf.Timestamp
\x12\x13\n\x0b
log_metrics
\x18\r
\x01
(
\x08\x12\x14\n\x0c
input_embeds
\x18\x0e
\x03
(
\x02\x12\x0f\n\x07
lora_id
\x18\x0f
\x01
(
\t\x12\x1a\n\x12\x64\x61
ta_parallel_rank
\x18\x10
\x01
(
\x05\x12\x0e\n\x06
stream
\x18\x11
\x01
(
\x08\"
:
\n\x0e
TokenizedInput
\x12\x15\n\r
original_text
\x18\x01
\x01
(
\t\x12\x11\n\t
input_ids
\x18\x02
\x03
(
\r\"\xd3\x01\n\x10
MultimodalInputs
\x12\x12\n\n
image_urls
\x18\x01
\x03
(
\t\x12\x12\n\n
video_urls
\x18\x02
\x03
(
\t\x12\x12\n\n
audio_urls
\x18\x03
\x03
(
\t\x12\x33\n\x12
processed_features
\x18\x04
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x12\x12\n\n
image_data
\x18\x05
\x03
(
\x0c\x12\x12\n\n
video_data
\x18\x06
\x03
(
\x0c\x12\x12\n\n
audio_data
\x18\x07
\x03
(
\x0c\x12\x12\n\n
modalities
\x18\x08
\x03
(
\t\"\xe3\x01\n\x10
GenerateResponse
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12
;
\n\x05\x63
hunk
\x18\x02
\x01
(
\x0b\x32
*.sglang.grpc.scheduler.GenerateStreamChunkH
\x00\x12
;
\n\x08\x63
omplete
\x18\x03
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.GenerateCompleteH
\x00\x12\x35\n\x05\x65
rror
\x18\x04
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.GenerateErrorH
\x00\x42\n\n\x08
response
\"\x95\x02\n\x13
GenerateStreamChunk
\x12\x11\n\t
token_ids
\x18\x01
\x03
(
\r\x12\x15\n\r
prompt_tokens
\x18\x02
\x01
(
\x05\x12\x19\n\x11\x63
ompletion_tokens
\x18\x03
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x04
\x01
(
\x05\x12
>
\n\x0f
output_logprobs
\x18\x05
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.OutputLogProbs
\x12\x15\n\r
hidden_states
\x18\x06
\x03
(
\x02\x12
<
\n\x0e
input_logprobs
\x18\x07
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.InputLogProbs
\x12\r\n\x05
index
\x18\x08
\x01
(
\r\"\x9b\x03\n\x10
GenerateComplete
\x12\x12\n\n
output_ids
\x18\x01
\x03
(
\r\x12\x15\n\r
finish_reason
\x18\x02
\x01
(
\t\x12\x15\n\r
prompt_tokens
\x18\x03
\x01
(
\x05\x12\x19\n\x11\x63
ompletion_tokens
\x18\x04
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x05
\x01
(
\x05\x12
>
\n\x0f
output_logprobs
\x18\x06
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.OutputLogProbs
\x12
>
\n\x11\x61
ll_hidden_states
\x18\x07
\x03
(
\x0b\x32
#.sglang.grpc.scheduler.HiddenStates
\x12\x1a\n\x10
matched_token_id
\x18\x08
\x01
(
\r
H
\x00\x12\x1a\n\x10
matched_stop_str
\x18\t
\x01
(
\t
H
\x00\x12
<
\n\x0e
input_logprobs
\x18\n
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.InputLogProbs
\x12\r\n\x05
index
\x18\x0b
\x01
(
\r
B
\x0e\n\x0c
matched_stop
\"
K
\n\r
GenerateError
\x12\x0f\n\x07
message
\x18\x01
\x01
(
\t\x12\x18\n\x10
http_status_code
\x18\x02
\x01
(
\t\x12\x0f\n\x07\x64\x65
tails
\x18\x03
\x01
(
\t\"
u
\n\x0e
OutputLogProbs
\x12\x16\n\x0e
token_logprobs
\x18\x01
\x03
(
\x02\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\x12\x38\n\x0c
top_logprobs
\x18\x03
\x03
(
\x0b\x32\"
.sglang.grpc.scheduler.TopLogProbs
\"\x9e\x01\n\r
InputLogProbs
\x12
@
\n\x0e
token_logprobs
\x18\x01
\x03
(
\x0b\x32
(.sglang.grpc.scheduler.InputTokenLogProb
\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\x12\x38\n\x0c
top_logprobs
\x18\x03
\x03
(
\x0b\x32\"
.sglang.grpc.scheduler.TopLogProbs
\"
1
\n\x11
InputTokenLogProb
\x12\x12\n\x05
value
\x18\x01
\x01
(
\x02
H
\x00\x88\x01\x01\x42\x08\n\x06
_value
\"
0
\n\x0b
TopLogProbs
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\"
?
\n\x0c
HiddenStates
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\r\n\x05
layer
\x18\x02
\x01
(
\x05\x12\x10\n\x08
position
\x18\x03
\x01
(
\x05\"\xca\x02\n\x0c\x45
mbedRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\t
tokenized
\x18\x02
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\x12
:
\n\t
mm_inputs
\x18\x04
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.MultimodalInputs
\x12
>
\n\x0f
sampling_params
\x18\x05
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.SamplingParams
\x12\x13\n\x0b
log_metrics
\x18\x06
\x01
(
\x08\x12\x16\n\x0e
token_type_ids
\x18\x07
\x03
(
\x05\x12\x1a\n\x12\x64\x61
ta_parallel_rank
\x18\x08
\x01
(
\x05\x12\x18\n\x10
is_cross_encoder
\x18\t
\x01
(
\x08\x12\r\n\x05
texts
\x18\n
\x03
(
\t\"\x9d\x01\n\r
EmbedResponse
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\x08\x63
omplete
\x18\x02
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.EmbedCompleteH
\x00\x12\x32\n\x05\x65
rror
\x18\x03
\x01
(
\x0b\x32
!.sglang.grpc.scheduler.EmbedErrorH
\x00\x42\n\n\x08
response
\"\xa3\x01\n\r
EmbedComplete
\x12\x11\n\t
embedding
\x18\x01
\x03
(
\x02\x12\x15\n\r
prompt_tokens
\x18\x02
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x03
\x01
(
\x05\x12\x15\n\r
embedding_dim
\x18\x04
\x01
(
\x05\x12
:
\n\x10\x62\x61
tch_embeddings
\x18\x05
\x03
(
\x0b\x32
.sglang.grpc.scheduler.Embedding
\"
*
\n\t
Embedding
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\r\n\x05
index
\x18\x02
\x01
(
\x05\"
<
\n\n
EmbedError
\x12\x0f\n\x07
message
\x18\x01
\x01
(
\t\x12\x0c\n\x04\x63
ode
\x18\x02
\x01
(
\t\x12\x0f\n\x07\x64\x65
tails
\x18\x03
\x01
(
\t\"
N
\n\x12
HealthCheckRequest
\x12\x38\n\t
tokenized
\x18\x01
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\"
7
\n\x13
HealthCheckResponse
\x12\x0f\n\x07
healthy
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
2
\n\x0c\x41\x62
ortRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x0e\n\x06
reason
\x18\x02
\x01
(
\t\"
1
\n\r
AbortResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
I
\n\x0f
LoadLoRARequest
\x12\x12\n\n
adapter_id
\x18\x01
\x01
(
\t\x12\x14\n\x0c\x61\x64\x61
pter_path
\x18\x02
\x01
(
\t\x12\x0c\n\x04
rank
\x18\x03
\x01
(
\x05\"
H
\n\x10
LoadLoRAResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x12\n\n
adapter_id
\x18\x02
\x01
(
\t\x12\x0f\n\x07
message
\x18\x03
\x01
(
\t\"\'\n\x11
UnloadLoRARequest
\x12\x12\n\n
adapter_id
\x18\x01
\x01
(
\t\"
6
\n\x12
UnloadLoRAResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
w
\n\x14
UpdateWeightsRequest
\x12\x13\n\t
disk_path
\x18\x01
\x01
(
\t
H
\x00\x12\x15\n\x0b
tensor_data
\x18\x02
\x01
(
\x0c
H
\x00\x12\x14\n\n
remote_url
\x18\x03
\x01
(
\t
H
\x00\x12\x13\n\x0b
weight_name
\x18\x04
\x01
(
\t
B
\x08\n\x06
source
\"
9
\n\x15
UpdateWeightsResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
-
\n\x17
GetInternalStateRequest
\x12\x12\n\n
state_keys
\x18\x01
\x03
(
\t\"
B
\n\x18
GetInternalStateResponse
\x12
&
\n\x05
state
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\"
A
\n\x17
SetInternalStateRequest
\x12
&
\n\x05
state
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\"
<
\n\x18
SetInternalStateResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t
2
\xfe\x02
\n\x0f
SglangScheduler
\x12
]
\n\x08
Generate
\x12
&.sglang.grpc.scheduler.GenerateRequest
\x1a\'
.sglang.grpc.scheduler.GenerateResponse0
\x01\x12
R
\n\x05\x45
mbed
\x12
#.sglang.grpc.scheduler.EmbedRequest
\x1a
$.sglang.grpc.scheduler.EmbedResponse
\x12\x64\n\x0b
HealthCheck
\x12
).sglang.grpc.scheduler.HealthCheckRequest
\x1a
*.sglang.grpc.scheduler.HealthCheckResponse
\x12
R
\n\x05\x41\x62
ort
\x12
#.sglang.grpc.scheduler.AbortRequest
\x1a
$.sglang.grpc.scheduler.AbortResponseb
\x06
proto3'
)
DESCRIPTOR
=
_descriptor_pool
.
Default
().
AddSerializedFile
(
b
'
\n\x16
sglang_scheduler.proto
\x12\x15
sglang.grpc.scheduler
\x1a\x1f
google/protobuf/timestamp.proto
\x1a\x1c
google/protobuf/struct.proto
\"\xd0\x05\n\x0e
SamplingParams
\x12\x13\n\x0b
temperature
\x18\x01
\x01
(
\x02\x12\r\n\x05
top_p
\x18\x02
\x01
(
\x02\x12\r\n\x05
top_k
\x18\x03
\x01
(
\x05\x12\r\n\x05
min_p
\x18\x04
\x01
(
\x02\x12\x19\n\x11\x66
requency_penalty
\x18\x05
\x01
(
\x02\x12\x18\n\x10
presence_penalty
\x18\x06
\x01
(
\x02\x12\x1a\n\x12
repetition_penalty
\x18\x07
\x01
(
\x02\x12\x1b\n\x0e
max_new_tokens
\x18\x08
\x01
(
\x05
H
\x01\x88\x01\x01\x12\x0c\n\x04
stop
\x18\t
\x03
(
\t\x12\x16\n\x0e
stop_token_ids
\x18\n
\x03
(
\r\x12\x1b\n\x13
skip_special_tokens
\x18\x0b
\x01
(
\x08\x12
%
\n\x1d
spaces_between_special_tokens
\x18\x0c
\x01
(
\x08\x12\x0f\n\x05
regex
\x18\r
\x01
(
\t
H
\x00\x12\x15\n\x0b
json_schema
\x18\x0e
\x01
(
\t
H
\x00\x12\x16\n\x0c\x65\x62
nf_grammar
\x18\x0f
\x01
(
\t
H
\x00\x12\x18\n\x0e
structural_tag
\x18\x10
\x01
(
\t
H
\x00\x12\t\n\x01
n
\x18\x11
\x01
(
\x05\x12\x16\n\x0e
min_new_tokens
\x18\x12
\x01
(
\x05\x12\x12\n\n
ignore_eos
\x18\x13
\x01
(
\x08\x12\x14\n\x0c
no_stop_trim
\x18\x14
\x01
(
\x08\x12\x1c\n\x0f
stream_interval
\x18\x15
\x01
(
\x05
H
\x02\x88\x01\x01\x12
H
\n\n
logit_bias
\x18\x16
\x03
(
\x0b\x32\x34
.sglang.grpc.scheduler.SamplingParams.LogitBiasEntry
\x12
.
\n\r
custom_params
\x18\x17
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x1a\x30\n\x0e
LogitBiasEntry
\x12\x0b\n\x03
key
\x18\x01
\x01
(
\t\x12\r\n\x05
value
\x18\x02
\x01
(
\x02
:
\x02\x38\x01\x42\x0c\n\n
constraintB
\x11\n\x0f
_max_new_tokensB
\x12\n\x10
_stream_interval
\"
]
\n\x13\x44
isaggregatedParams
\x12\x16\n\x0e\x62
ootstrap_host
\x18\x01
\x01
(
\t\x12\x16\n\x0e\x62
ootstrap_port
\x18\x02
\x01
(
\x05\x12\x16\n\x0e\x62
ootstrap_room
\x18\x03
\x01
(
\x05\"\xe2\x04\n\x0f
GenerateRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\t
tokenized
\x18\x02
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\x12
:
\n\t
mm_inputs
\x18\x03
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.MultimodalInputs
\x12
>
\n\x0f
sampling_params
\x18\x04
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.SamplingParams
\x12\x16\n\x0e
return_logprob
\x18\x05
\x01
(
\x08\x12\x19\n\x11
logprob_start_len
\x18\x06
\x01
(
\x05\x12\x18\n\x10
top_logprobs_num
\x18\x07
\x01
(
\x05\x12\x19\n\x11
token_ids_logprob
\x18\x08
\x03
(
\r\x12\x1c\n\x14
return_hidden_states
\x18\t
\x01
(
\x08\x12
H
\n\x14\x64
isaggregated_params
\x18\n
\x01
(
\x0b\x32
*.sglang.grpc.scheduler.DisaggregatedParams
\x12\x1e\n\x16\x63
ustom_logit_processor
\x18\x0b
\x01
(
\t\x12
-
\n\t
timestamp
\x18\x0c
\x01
(
\x0b\x32\x1a
.google.protobuf.Timestamp
\x12\x13\n\x0b
log_metrics
\x18\r
\x01
(
\x08\x12\x14\n\x0c
input_embeds
\x18\x0e
\x03
(
\x02\x12\x0f\n\x07
lora_id
\x18\x0f
\x01
(
\t\x12\x1a\n\x12\x64\x61
ta_parallel_rank
\x18\x10
\x01
(
\x05\x12\x0e\n\x06
stream
\x18\x11
\x01
(
\x08\"
:
\n\x0e
TokenizedInput
\x12\x15\n\r
original_text
\x18\x01
\x01
(
\t\x12\x11\n\t
input_ids
\x18\x02
\x03
(
\r\"\xd3\x01\n\x10
MultimodalInputs
\x12\x12\n\n
image_urls
\x18\x01
\x03
(
\t\x12\x12\n\n
video_urls
\x18\x02
\x03
(
\t\x12\x12\n\n
audio_urls
\x18\x03
\x03
(
\t\x12\x33\n\x12
processed_features
\x18\x04
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x12\x12\n\n
image_data
\x18\x05
\x03
(
\x0c\x12\x12\n\n
video_data
\x18\x06
\x03
(
\x0c\x12\x12\n\n
audio_data
\x18\x07
\x03
(
\x0c\x12\x12\n\n
modalities
\x18\x08
\x03
(
\t\"\xe3\x01\n\x10
GenerateResponse
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12
;
\n\x05\x63
hunk
\x18\x02
\x01
(
\x0b\x32
*.sglang.grpc.scheduler.GenerateStreamChunkH
\x00\x12
;
\n\x08\x63
omplete
\x18\x03
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.GenerateCompleteH
\x00\x12\x35\n\x05\x65
rror
\x18\x04
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.GenerateErrorH
\x00\x42\n\n\x08
response
\"\x95\x02\n\x13
GenerateStreamChunk
\x12\x11\n\t
token_ids
\x18\x01
\x03
(
\r\x12\x15\n\r
prompt_tokens
\x18\x02
\x01
(
\x05\x12\x19\n\x11\x63
ompletion_tokens
\x18\x03
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x04
\x01
(
\x05\x12
>
\n\x0f
output_logprobs
\x18\x05
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.OutputLogProbs
\x12\x15\n\r
hidden_states
\x18\x06
\x03
(
\x02\x12
<
\n\x0e
input_logprobs
\x18\x07
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.InputLogProbs
\x12\r\n\x05
index
\x18\x08
\x01
(
\r\"\x9b\x03\n\x10
GenerateComplete
\x12\x12\n\n
output_ids
\x18\x01
\x03
(
\r\x12\x15\n\r
finish_reason
\x18\x02
\x01
(
\t\x12\x15\n\r
prompt_tokens
\x18\x03
\x01
(
\x05\x12\x19\n\x11\x63
ompletion_tokens
\x18\x04
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x05
\x01
(
\x05\x12
>
\n\x0f
output_logprobs
\x18\x06
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.OutputLogProbs
\x12
>
\n\x11\x61
ll_hidden_states
\x18\x07
\x03
(
\x0b\x32
#.sglang.grpc.scheduler.HiddenStates
\x12\x1a\n\x10
matched_token_id
\x18\x08
\x01
(
\r
H
\x00\x12\x1a\n\x10
matched_stop_str
\x18\t
\x01
(
\t
H
\x00\x12
<
\n\x0e
input_logprobs
\x18\n
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.InputLogProbs
\x12\r\n\x05
index
\x18\x0b
\x01
(
\r
B
\x0e\n\x0c
matched_stop
\"
K
\n\r
GenerateError
\x12\x0f\n\x07
message
\x18\x01
\x01
(
\t\x12\x18\n\x10
http_status_code
\x18\x02
\x01
(
\t\x12\x0f\n\x07\x64\x65
tails
\x18\x03
\x01
(
\t\"
u
\n\x0e
OutputLogProbs
\x12\x16\n\x0e
token_logprobs
\x18\x01
\x03
(
\x02\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\x12\x38\n\x0c
top_logprobs
\x18\x03
\x03
(
\x0b\x32\"
.sglang.grpc.scheduler.TopLogProbs
\"\x9e\x01\n\r
InputLogProbs
\x12
@
\n\x0e
token_logprobs
\x18\x01
\x03
(
\x0b\x32
(.sglang.grpc.scheduler.InputTokenLogProb
\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\x12\x38\n\x0c
top_logprobs
\x18\x03
\x03
(
\x0b\x32\"
.sglang.grpc.scheduler.TopLogProbs
\"
1
\n\x11
InputTokenLogProb
\x12\x12\n\x05
value
\x18\x01
\x01
(
\x02
H
\x00\x88\x01\x01\x42\x08\n\x06
_value
\"
0
\n\x0b
TopLogProbs
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\"
?
\n\x0c
HiddenStates
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\r\n\x05
layer
\x18\x02
\x01
(
\x05\x12\x10\n\x08
position
\x18\x03
\x01
(
\x05\"\xca\x02\n\x0c\x45
mbedRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\t
tokenized
\x18\x02
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\x12
:
\n\t
mm_inputs
\x18\x04
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.MultimodalInputs
\x12
>
\n\x0f
sampling_params
\x18\x05
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.SamplingParams
\x12\x13\n\x0b
log_metrics
\x18\x06
\x01
(
\x08\x12\x16\n\x0e
token_type_ids
\x18\x07
\x03
(
\x05\x12\x1a\n\x12\x64\x61
ta_parallel_rank
\x18\x08
\x01
(
\x05\x12\x18\n\x10
is_cross_encoder
\x18\t
\x01
(
\x08\x12\r\n\x05
texts
\x18\n
\x03
(
\t\"\x9d\x01\n\r
EmbedResponse
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\x08\x63
omplete
\x18\x02
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.EmbedCompleteH
\x00\x12\x32\n\x05\x65
rror
\x18\x03
\x01
(
\x0b\x32
!.sglang.grpc.scheduler.EmbedErrorH
\x00\x42\n\n\x08
response
\"\xa3\x01\n\r
EmbedComplete
\x12\x11\n\t
embedding
\x18\x01
\x03
(
\x02\x12\x15\n\r
prompt_tokens
\x18\x02
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x03
\x01
(
\x05\x12\x15\n\r
embedding_dim
\x18\x04
\x01
(
\x05\x12
:
\n\x10\x62\x61
tch_embeddings
\x18\x05
\x03
(
\x0b\x32
.sglang.grpc.scheduler.Embedding
\"
*
\n\t
Embedding
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\r\n\x05
index
\x18\x02
\x01
(
\x05\"
<
\n\n
EmbedError
\x12\x0f\n\x07
message
\x18\x01
\x01
(
\t\x12\x0c\n\x04\x63
ode
\x18\x02
\x01
(
\t\x12\x0f\n\x07\x64\x65
tails
\x18\x03
\x01
(
\t\"
N
\n\x12
HealthCheckRequest
\x12\x38\n\t
tokenized
\x18\x01
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\"
7
\n\x13
HealthCheckResponse
\x12\x0f\n\x07
healthy
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
2
\n\x0c\x41\x62
ortRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x0e\n\x06
reason
\x18\x02
\x01
(
\t\"
1
\n\r
AbortResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
I
\n\x0f
LoadLoRARequest
\x12\x12\n\n
adapter_id
\x18\x01
\x01
(
\t\x12\x14\n\x0c\x61\x64\x61
pter_path
\x18\x02
\x01
(
\t\x12\x0c\n\x04
rank
\x18\x03
\x01
(
\x05\"
H
\n\x10
LoadLoRAResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x12\n\n
adapter_id
\x18\x02
\x01
(
\t\x12\x0f\n\x07
message
\x18\x03
\x01
(
\t\"\'\n\x11
UnloadLoRARequest
\x12\x12\n\n
adapter_id
\x18\x01
\x01
(
\t\"
6
\n\x12
UnloadLoRAResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
w
\n\x14
UpdateWeightsRequest
\x12\x13\n\t
disk_path
\x18\x01
\x01
(
\t
H
\x00\x12\x15\n\x0b
tensor_data
\x18\x02
\x01
(
\x0c
H
\x00\x12\x14\n\n
remote_url
\x18\x03
\x01
(
\t
H
\x00\x12\x13\n\x0b
weight_name
\x18\x04
\x01
(
\t
B
\x08\n\x06
source
\"
9
\n\x15
UpdateWeightsResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
-
\n\x17
GetInternalStateRequest
\x12\x12\n\n
state_keys
\x18\x01
\x03
(
\t\"
B
\n\x18
GetInternalStateResponse
\x12
&
\n\x05
state
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\"
A
\n\x17
SetInternalStateRequest
\x12
&
\n\x05
state
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\"
<
\n\x18
SetInternalStateResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t
\"\x15\n\x13
GetModelInfoRequest
\"\xea\x02\n\x14
GetModelInfoResponse
\x12\x12\n\n
model_path
\x18\x01
\x01
(
\t\x12\x16\n\x0e
tokenizer_path
\x18\x02
\x01
(
\t\x12\x15\n\r
is_generation
\x18\x03
\x01
(
\x08\x12
!
\n\x19
preferred_sampling_params
\x18\x04
\x01
(
\t\x12\x16\n\x0e
weight_version
\x18\x05
\x01
(
\t\x12\x19\n\x11
served_model_name
\x18\x06
\x01
(
\t\x12\x1a\n\x12
max_context_length
\x18\x07
\x01
(
\x05\x12\x12\n\n
vocab_size
\x18\x08
\x01
(
\x05\x12\x17\n\x0f
supports_vision
\x18\t
\x01
(
\x08\x12\x12\n\n
model_type
\x18\n
\x01
(
\t\x12\x15\n\r
eos_token_ids
\x18\x0b
\x03
(
\x05\x12\x14\n\x0c
pad_token_id
\x18\x0c
\x01
(
\x05\x12\x14\n\x0c\x62
os_token_id
\x18\r
\x01
(
\x05\x12\x19\n\x11
max_req_input_len
\x18\x0e
\x01
(
\x05\"\x16\n\x14
GetServerInfoRequest
\"\xb7\x02\n\x15
GetServerInfoResponse
\x12
,
\n\x0b
server_args
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x12
/
\n\x0e
scheduler_info
\x18\x02
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x12\x17\n\x0f\x61\x63
tive_requests
\x18\x03
\x01
(
\x05\x12\x11\n\t
is_paused
\x18\x04
\x01
(
\x08\x12\x1e\n\x16
last_receive_timestamp
\x18\x05
\x01
(
\x01\x12\x16\n\x0e
uptime_seconds
\x18\x06
\x01
(
\x01\x12\x16\n\x0e
sglang_version
\x18\x07
\x01
(
\t\x12\x13\n\x0b
server_type
\x18\x08
\x01
(
\t\x12
.
\n\n
start_time
\x18\t
\x01
(
\x0b\x32\x1a
.google.protobuf.Timestamp2
\xd3\x04
\n\x0f
SglangScheduler
\x12
]
\n\x08
Generate
\x12
&.sglang.grpc.scheduler.GenerateRequest
\x1a\'
.sglang.grpc.scheduler.GenerateResponse0
\x01\x12
R
\n\x05\x45
mbed
\x12
#.sglang.grpc.scheduler.EmbedRequest
\x1a
$.sglang.grpc.scheduler.EmbedResponse
\x12\x64\n\x0b
HealthCheck
\x12
).sglang.grpc.scheduler.HealthCheckRequest
\x1a
*.sglang.grpc.scheduler.HealthCheckResponse
\x12
R
\n\x05\x41\x62
ort
\x12
#.sglang.grpc.scheduler.AbortRequest
\x1a
$.sglang.grpc.scheduler.AbortResponse
\x12
g
\n\x0c
GetModelInfo
\x12
*.sglang.grpc.scheduler.GetModelInfoRequest
\x1a
+.sglang.grpc.scheduler.GetModelInfoResponse
\x12
j
\n\r
GetServerInfo
\x12
+.sglang.grpc.scheduler.GetServerInfoRequest
\x1a
,.sglang.grpc.scheduler.GetServerInfoResponse
b
\x06
proto3'
)
_globals
=
globals
()
_builder
.
BuildMessageAndEnumDescriptors
(
DESCRIPTOR
,
_globals
)
...
...
@@ -106,6 +106,14 @@ if not _descriptor._USE_C_DESCRIPTORS:
_globals
[
'_SETINTERNALSTATEREQUEST'
].
_serialized_end
=
4875
_globals
[
'_SETINTERNALSTATERESPONSE'
].
_serialized_start
=
4877
_globals
[
'_SETINTERNALSTATERESPONSE'
].
_serialized_end
=
4937
_globals
[
'_SGLANGSCHEDULER'
].
_serialized_start
=
4940
_globals
[
'_SGLANGSCHEDULER'
].
_serialized_end
=
5322
_globals
[
'_GETMODELINFOREQUEST'
].
_serialized_start
=
4939
_globals
[
'_GETMODELINFOREQUEST'
].
_serialized_end
=
4960
_globals
[
'_GETMODELINFORESPONSE'
].
_serialized_start
=
4963
_globals
[
'_GETMODELINFORESPONSE'
].
_serialized_end
=
5325
_globals
[
'_GETSERVERINFOREQUEST'
].
_serialized_start
=
5327
_globals
[
'_GETSERVERINFOREQUEST'
].
_serialized_end
=
5349
_globals
[
'_GETSERVERINFORESPONSE'
].
_serialized_start
=
5352
_globals
[
'_GETSERVERINFORESPONSE'
].
_serialized_end
=
5663
_globals
[
'_SGLANGSCHEDULER'
].
_serialized_start
=
5666
_globals
[
'_SGLANGSCHEDULER'
].
_serialized_end
=
6261
# @@protoc_insertion_point(module_scope)
python/sglang/srt/grpc/sglang_scheduler_pb2.pyi
View file @
2fcd56ea
...
...
@@ -428,3 +428,65 @@ class SetInternalStateResponse(_message.Message):
success: bool
message: str
def __init__(self, success: bool = ..., message: _Optional[str] = ...) -> None: ...
# Type stub for the field-less GetModelInfoRequest protobuf message.
class GetModelInfoRequest(_message.Message):
    __slots__ = ()

    def __init__(self) -> None: ...
# Type stub for the GetModelInfoResponse protobuf message: one
# *_FIELD_NUMBER constant and one typed attribute per proto field.
class GetModelInfoResponse(_message.Message):
    __slots__ = (
        "model_path",
        "tokenizer_path",
        "is_generation",
        "preferred_sampling_params",
        "weight_version",
        "served_model_name",
        "max_context_length",
        "vocab_size",
        "supports_vision",
        "model_type",
        "eos_token_ids",
        "pad_token_id",
        "bos_token_id",
        "max_req_input_len",
    )

    # Field-number constants.
    MODEL_PATH_FIELD_NUMBER: _ClassVar[int]
    TOKENIZER_PATH_FIELD_NUMBER: _ClassVar[int]
    IS_GENERATION_FIELD_NUMBER: _ClassVar[int]
    PREFERRED_SAMPLING_PARAMS_FIELD_NUMBER: _ClassVar[int]
    WEIGHT_VERSION_FIELD_NUMBER: _ClassVar[int]
    SERVED_MODEL_NAME_FIELD_NUMBER: _ClassVar[int]
    MAX_CONTEXT_LENGTH_FIELD_NUMBER: _ClassVar[int]
    VOCAB_SIZE_FIELD_NUMBER: _ClassVar[int]
    SUPPORTS_VISION_FIELD_NUMBER: _ClassVar[int]
    MODEL_TYPE_FIELD_NUMBER: _ClassVar[int]
    EOS_TOKEN_IDS_FIELD_NUMBER: _ClassVar[int]
    PAD_TOKEN_ID_FIELD_NUMBER: _ClassVar[int]
    BOS_TOKEN_ID_FIELD_NUMBER: _ClassVar[int]
    MAX_REQ_INPUT_LEN_FIELD_NUMBER: _ClassVar[int]

    # Message fields.
    model_path: str
    tokenizer_path: str
    is_generation: bool
    preferred_sampling_params: str
    weight_version: str
    served_model_name: str
    max_context_length: int
    vocab_size: int
    supports_vision: bool
    model_type: str
    eos_token_ids: _containers.RepeatedScalarFieldContainer[int]
    pad_token_id: int
    bos_token_id: int
    max_req_input_len: int

    def __init__(
        self,
        model_path: _Optional[str] = ...,
        tokenizer_path: _Optional[str] = ...,
        is_generation: bool = ...,
        preferred_sampling_params: _Optional[str] = ...,
        weight_version: _Optional[str] = ...,
        served_model_name: _Optional[str] = ...,
        max_context_length: _Optional[int] = ...,
        vocab_size: _Optional[int] = ...,
        supports_vision: bool = ...,
        model_type: _Optional[str] = ...,
        eos_token_ids: _Optional[_Iterable[int]] = ...,
        pad_token_id: _Optional[int] = ...,
        bos_token_id: _Optional[int] = ...,
        max_req_input_len: _Optional[int] = ...,
    ) -> None: ...
# Type stub for the GetServerInfoRequest protobuf message.
# The message declares no fields: __slots__ is empty and the constructor
# takes no arguments besides self.
class GetServerInfoRequest(_message.Message):
__slots__ = ()
def __init__(self) -> None: ...
# Type stub for the GetServerInfoResponse protobuf message.
# Declares nine fields (listed in __slots__), one *_FIELD_NUMBER class
# constant per field, the Python-side type of each field, and a constructor
# in which every field is optional.
class GetServerInfoResponse(_message.Message):
__slots__ = ("server_args", "scheduler_info", "active_requests", "is_paused", "last_receive_timestamp", "uptime_seconds", "sglang_version", "server_type", "start_time")
# One integer class constant per field; presumably the protobuf field tags
# from the .proto definition -- confirm against the schema.
SERVER_ARGS_FIELD_NUMBER: _ClassVar[int]
SCHEDULER_INFO_FIELD_NUMBER: _ClassVar[int]
ACTIVE_REQUESTS_FIELD_NUMBER: _ClassVar[int]
IS_PAUSED_FIELD_NUMBER: _ClassVar[int]
LAST_RECEIVE_TIMESTAMP_FIELD_NUMBER: _ClassVar[int]
UPTIME_SECONDS_FIELD_NUMBER: _ClassVar[int]
SGLANG_VERSION_FIELD_NUMBER: _ClassVar[int]
SERVER_TYPE_FIELD_NUMBER: _ClassVar[int]
START_TIME_FIELD_NUMBER: _ClassVar[int]
# Two free-form structured fields, typed as google.protobuf Struct messages.
server_args: _struct_pb2.Struct
scheduler_info: _struct_pb2.Struct
active_requests: int
is_paused: bool
# NOTE(review): a bare float while start_time is a Timestamp message; the
# unit/epoch of this value is not visible here -- confirm with the server code.
last_receive_timestamp: float
uptime_seconds: float
sglang_version: str
server_type: str
start_time: _timestamp_pb2.Timestamp
# Every parameter defaults to `...`. The Struct fields also accept a mapping,
# and start_time also accepts a datetime.datetime or a mapping.
# NOTE(review): the signature references `datetime.datetime`, so this stub
# needs `import datetime` at module level -- not visible in this hunk; confirm.
def __init__(self, server_args: _Optional[_Union[_struct_pb2.Struct, _Mapping]] = ..., scheduler_info: _Optional[_Union[_struct_pb2.Struct, _Mapping]] = ..., active_requests: _Optional[int] = ..., is_paused: bool = ..., last_receive_timestamp: _Optional[float] = ..., uptime_seconds: _Optional[float] = ..., sglang_version: _Optional[str] = ..., server_type: _Optional[str] = ..., start_time: _Optional[_Union[datetime.datetime, _timestamp_pb2.Timestamp, _Mapping]] = ...) -> None: ...
python/sglang/srt/grpc/sglang_scheduler_pb2_grpc.py
View file @
2fcd56ea
...
...
@@ -59,6 +59,16 @@ class SglangSchedulerStub(object):
request_serializer
=
sglang__scheduler__pb2
.
AbortRequest
.
SerializeToString
,
response_deserializer
=
sglang__scheduler__pb2
.
AbortResponse
.
FromString
,
_registered_method
=
True
)
self
.
GetModelInfo
=
channel
.
unary_unary
(
'/sglang.grpc.scheduler.SglangScheduler/GetModelInfo'
,
request_serializer
=
sglang__scheduler__pb2
.
GetModelInfoRequest
.
SerializeToString
,
response_deserializer
=
sglang__scheduler__pb2
.
GetModelInfoResponse
.
FromString
,
_registered_method
=
True
)
self
.
GetServerInfo
=
channel
.
unary_unary
(
'/sglang.grpc.scheduler.SglangScheduler/GetServerInfo'
,
request_serializer
=
sglang__scheduler__pb2
.
GetServerInfoRequest
.
SerializeToString
,
response_deserializer
=
sglang__scheduler__pb2
.
GetServerInfoResponse
.
FromString
,
_registered_method
=
True
)
class
SglangSchedulerServicer
(
object
):
...
...
@@ -94,6 +104,20 @@ class SglangSchedulerServicer(object):
context
.
set_details
(
'Method not implemented!'
)
raise
NotImplementedError
(
'Method not implemented!'
)
def GetModelInfo(self, request, context):
    """Placeholder handler for the GetModelInfo RPC.

    Sets the UNIMPLEMENTED status code and a details message on
    ``context``, then raises ``NotImplementedError``. ``request`` is
    not inspected.
    """
    details = 'Method not implemented!'
    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
    context.set_details(details)
    raise NotImplementedError(details)
def GetServerInfo(self, request, context):
    """Placeholder handler for the GetServerInfo RPC.

    Sets the UNIMPLEMENTED status code and a details message on
    ``context``, then raises ``NotImplementedError``. ``request`` is
    not inspected.
    """
    details = 'Method not implemented!'
    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
    context.set_details(details)
    raise NotImplementedError(details)
def
add_SglangSchedulerServicer_to_server
(
servicer
,
server
):
rpc_method_handlers
=
{
...
...
@@ -117,6 +141,16 @@ def add_SglangSchedulerServicer_to_server(servicer, server):
request_deserializer
=
sglang__scheduler__pb2
.
AbortRequest
.
FromString
,
response_serializer
=
sglang__scheduler__pb2
.
AbortResponse
.
SerializeToString
,
),
'GetModelInfo'
:
grpc
.
unary_unary_rpc_method_handler
(
servicer
.
GetModelInfo
,
request_deserializer
=
sglang__scheduler__pb2
.
GetModelInfoRequest
.
FromString
,
response_serializer
=
sglang__scheduler__pb2
.
GetModelInfoResponse
.
SerializeToString
,
),
'GetServerInfo'
:
grpc
.
unary_unary_rpc_method_handler
(
servicer
.
GetServerInfo
,
request_deserializer
=
sglang__scheduler__pb2
.
GetServerInfoRequest
.
FromString
,
response_serializer
=
sglang__scheduler__pb2
.
GetServerInfoResponse
.
SerializeToString
,
),
}
generic_handler
=
grpc
.
method_handlers_generic_handler
(
'sglang.grpc.scheduler.SglangScheduler'
,
rpc_method_handlers
)
...
...
@@ -237,3 +271,57 @@ class SglangScheduler(object):
timeout
,
metadata
,
_registered_method
=
True
)
@staticmethod
def GetModelInfo(
    request,
    target,
    options=(),
    channel_credentials=None,
    call_credentials=None,
    insecure=False,
    compression=None,
    wait_for_ready=None,
    timeout=None,
    metadata=None,
):
    """Invoke the GetModelInfo RPC on ``target`` through ``grpc.experimental.unary_unary``.

    ``request`` is serialized with ``GetModelInfoRequest.SerializeToString``
    and the reply is parsed with ``GetModelInfoResponse.FromString``. All
    remaining arguments are forwarded unchanged to
    ``grpc.experimental.unary_unary``, whose result is returned.
    """
    rpc_path = '/sglang.grpc.scheduler.SglangScheduler/GetModelInfo'
    serialize_request = sglang__scheduler__pb2.GetModelInfoRequest.SerializeToString
    parse_response = sglang__scheduler__pb2.GetModelInfoResponse.FromString
    # Positional order follows the callee, not this signature:
    # `insecure` is passed before `call_credentials`.
    return grpc.experimental.unary_unary(
        request,
        target,
        rpc_path,
        serialize_request,
        parse_response,
        options,
        channel_credentials,
        insecure,
        call_credentials,
        compression,
        wait_for_ready,
        timeout,
        metadata,
        _registered_method=True,
    )
@staticmethod
def GetServerInfo(
    request,
    target,
    options=(),
    channel_credentials=None,
    call_credentials=None,
    insecure=False,
    compression=None,
    wait_for_ready=None,
    timeout=None,
    metadata=None,
):
    """Invoke the GetServerInfo RPC on ``target`` through ``grpc.experimental.unary_unary``.

    ``request`` is serialized with ``GetServerInfoRequest.SerializeToString``
    and the reply is parsed with ``GetServerInfoResponse.FromString``. All
    remaining arguments are forwarded unchanged to
    ``grpc.experimental.unary_unary``, whose result is returned.
    """
    rpc_path = '/sglang.grpc.scheduler.SglangScheduler/GetServerInfo'
    serialize_request = sglang__scheduler__pb2.GetServerInfoRequest.SerializeToString
    parse_response = sglang__scheduler__pb2.GetServerInfoResponse.FromString
    # Positional order follows the callee, not this signature:
    # `insecure` is passed before `call_credentials`.
    return grpc.experimental.unary_unary(
        request,
        target,
        rpc_path,
        serialize_request,
        parse_response,
        options,
        channel_credentials,
        insecure,
        call_credentials,
        compression,
        wait_for_ready,
        timeout,
        metadata,
        _registered_method=True,
    )
sgl-router/src/grpc_client/sglang_scheduler.rs
View file @
2fcd56ea
...
...
@@ -97,6 +97,30 @@ impl SglangSchedulerClient {
Ok
(())
}
/// Get model information
pub
async
fn
get_model_info
(
&
mut
self
,
)
->
Result
<
proto
::
GetModelInfoResponse
,
Box
<
dyn
std
::
error
::
Error
+
Send
+
Sync
>>
{
debug!
(
"Requesting model info"
);
let
request
=
Request
::
new
(
proto
::
GetModelInfoRequest
{});
let
response
=
self
.client
.get_model_info
(
request
)
.await
?
;
debug!
(
"Model info response received"
);
Ok
(
response
.into_inner
())
}
/// Get server information
pub
async
fn
get_server_info
(
&
mut
self
,
)
->
Result
<
proto
::
GetServerInfoResponse
,
Box
<
dyn
std
::
error
::
Error
+
Send
+
Sync
>>
{
debug!
(
"Requesting server info"
);
let
request
=
Request
::
new
(
proto
::
GetServerInfoRequest
{});
let
response
=
self
.client
.get_server_info
(
request
)
.await
?
;
debug!
(
"Server info response received"
);
Ok
(
response
.into_inner
())
}
/// Build a single SGLang GenerateRequest from OpenAI ChatCompletionRequest
pub
fn
build_generate_request
(
&
self
,
...
...
sgl-router/src/proto/sglang_scheduler.proto
View file @
2fcd56ea
...
...
@@ -20,6 +20,12 @@ service SglangScheduler {
// Abort a running request
rpc
Abort
(
AbortRequest
)
returns
(
AbortResponse
);
// Get model information
rpc
GetModelInfo
(
GetModelInfoRequest
)
returns
(
GetModelInfoResponse
);
// Get server information
rpc
GetServerInfo
(
GetServerInfoRequest
)
returns
(
GetServerInfoResponse
);
}
// =====================
...
...
@@ -401,3 +407,56 @@ message SetInternalStateResponse {
bool
success
=
1
;
string
message
=
2
;
}
// =====================
// Model and Server Info
// =====================
// Get model information.
// Request message for the GetModelInfo RPC; it carries no fields.
message
GetModelInfoRequest
{}
// Response message for the GetModelInfo RPC.
// Fields 1-6 and 10 are strings, 3 and 9 are booleans, 11 is a repeated
// int32, and the remaining fields are single int32 values.
// NOTE(review): a file with the same name and the same +59 line change exists
// under python/sglang/srt/grpc/ in this commit; presumably the two copies
// must be kept identical -- confirm.
message
GetModelInfoResponse
{
string
model_path
=
1
;
string
tokenizer_path
=
2
;
bool
is_generation
=
3
;
string
preferred_sampling_params
=
4
;
// JSON string or empty
string
weight_version
=
5
;
string
served_model_name
=
6
;
int32
max_context_length
=
7
;
int32
vocab_size
=
8
;
bool
supports_vision
=
9
;
string
model_type
=
10
;
repeated
int32
eos_token_ids
=
11
;
// NOTE(review): pad_token_id and bos_token_id are plain int32. If this file
// uses proto3 syntax (the syntax line is not visible here), an unset value
// reads as 0 and cannot be told apart from a real token id 0 -- confirm how
// "no such token" is signalled.
int32
pad_token_id
=
12
;
int32
bos_token_id
=
13
;
int32
max_req_input_len
=
14
;
}
// Get server information.
// Request message for the GetServerInfo RPC; it carries no fields.
message
GetServerInfoRequest
{}
// Response message for the GetServerInfo RPC.
// Uses the well-known types google.protobuf.Struct and
// google.protobuf.Timestamp; presumably google/protobuf/struct.proto and
// google/protobuf/timestamp.proto are imported at the top of this file --
// not visible in this hunk, confirm.
message
GetServerInfoResponse
{
// Server configuration (as structured data)
google.protobuf.Struct
server_args
=
1
;
// Scheduler metrics (from scheduler initialization)
google.protobuf.Struct
scheduler_info
=
2
;
// Runtime state
int32
active_requests
=
3
;
bool
is_paused
=
4
;
// NOTE(review): last_receive_timestamp is a bare double while start_time
// below is a google.protobuf.Timestamp; the unit/epoch of this value is not
// stated here -- confirm against the server implementation.
double
last_receive_timestamp
=
5
;
double
uptime_seconds
=
6
;
// Version info
string
sglang_version
=
7
;
// Server metadata
string
server_type
=
8
;
// "grpc"
google.protobuf.Timestamp
start_time
=
9
;
// Note: internal_states not provided in gRPC mode
// Scheduler-side metrics (memory usage, throughput) require
// bidirectional communicator infrastructure not available in gRPC.
// Use HTTP /get_server_info if scheduler internal state is needed.
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment