Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
2fcd56ea
Unverified
Commit
2fcd56ea
authored
Oct 07, 2025
by
Simo Lin
Committed by
GitHub
Oct 07, 2025
Browse files
[router] add get server info and get model info in grpc server (#11303)
parent
0958a397
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
393 additions
and
3 deletions
+393
-3
python/sglang/srt/entrypoints/grpc_server.py
python/sglang/srt/entrypoints/grpc_server.py
+90
-0
python/sglang/srt/grpc/sglang_scheduler.proto
python/sglang/srt/grpc/sglang_scheduler.proto
+59
-0
python/sglang/srt/grpc/sglang_scheduler_pb2.py
python/sglang/srt/grpc/sglang_scheduler_pb2.py
+11
-3
python/sglang/srt/grpc/sglang_scheduler_pb2.pyi
python/sglang/srt/grpc/sglang_scheduler_pb2.pyi
+62
-0
python/sglang/srt/grpc/sglang_scheduler_pb2_grpc.py
python/sglang/srt/grpc/sglang_scheduler_pb2_grpc.py
+88
-0
sgl-router/src/grpc_client/sglang_scheduler.rs
sgl-router/src/grpc_client/sglang_scheduler.rs
+24
-0
sgl-router/src/proto/sglang_scheduler.proto
sgl-router/src/proto/sglang_scheduler.proto
+59
-0
No files found.
python/sglang/srt/entrypoints/grpc_server.py
View file @
2fcd56ea
...
@@ -5,6 +5,7 @@ Uses GrpcRequestManager for orchestration without tokenization.
...
@@ -5,6 +5,7 @@ Uses GrpcRequestManager for orchestration without tokenization.
import
argparse
import
argparse
import
asyncio
import
asyncio
import
dataclasses
import
logging
import
logging
import
multiprocessing
as
mp
import
multiprocessing
as
mp
import
os
import
os
...
@@ -15,8 +16,11 @@ from typing import AsyncIterator, Dict, Optional, Tuple
...
@@ -15,8 +16,11 @@ from typing import AsyncIterator, Dict, Optional, Tuple
import
grpc
import
grpc
from
google.protobuf.json_format
import
MessageToDict
from
google.protobuf.json_format
import
MessageToDict
from
google.protobuf.struct_pb2
import
Struct
from
google.protobuf.timestamp_pb2
import
Timestamp
from
grpc_reflection.v1alpha
import
reflection
from
grpc_reflection.v1alpha
import
reflection
import
sglang
from
sglang.srt.disaggregation.utils
import
FAKE_BOOTSTRAP_HOST
,
DisaggregationMode
from
sglang.srt.disaggregation.utils
import
FAKE_BOOTSTRAP_HOST
,
DisaggregationMode
from
sglang.srt.entrypoints.grpc_request_manager
import
GrpcRequestManager
from
sglang.srt.entrypoints.grpc_request_manager
import
GrpcRequestManager
from
sglang.srt.grpc
import
sglang_scheduler_pb2
,
sglang_scheduler_pb2_grpc
from
sglang.srt.grpc
import
sglang_scheduler_pb2
,
sglang_scheduler_pb2_grpc
...
@@ -173,11 +177,13 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
...
@@ -173,11 +177,13 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
request_manager
:
GrpcRequestManager
,
request_manager
:
GrpcRequestManager
,
server_args
:
ServerArgs
,
server_args
:
ServerArgs
,
model_info
:
Dict
,
model_info
:
Dict
,
scheduler_info
:
Dict
,
):
):
"""Initialize the standalone gRPC service."""
"""Initialize the standalone gRPC service."""
self
.
request_manager
=
request_manager
self
.
request_manager
=
request_manager
self
.
server_args
=
server_args
self
.
server_args
=
server_args
self
.
model_info
=
model_info
self
.
model_info
=
model_info
self
.
scheduler_info
=
scheduler_info
self
.
start_time
=
time
.
time
()
self
.
start_time
=
time
.
time
()
# Start the request manager's event loop using auto_create_handle_loop
# Start the request manager's event loop using auto_create_handle_loop
...
@@ -396,6 +402,89 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
...
@@ -396,6 +402,89 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
message
=
str
(
e
),
message
=
str
(
e
),
)
)
async def GetModelInfo(
    self,
    request: sglang_scheduler_pb2.GetModelInfoRequest,
    context: grpc.aio.ServicerContext,
) -> sglang_scheduler_pb2.GetModelInfoResponse:
    """Report the model metadata held by this servicer.

    Values come from ``self.server_args`` and ``self.model_info``; neither
    ``request`` nor ``context`` is read.

    Returns:
        A ``GetModelInfoResponse`` populated from the server arguments and
        the model info dictionary.

    Raises:
        KeyError: If ``self.model_info`` lacks one of the keys read below.
    """
    logger.info("Model info request received")

    args = self.server_args
    info = self.model_info

    # Use the flag stored in scheduler_info when present; otherwise derive it
    # from the server arguments.
    generation_flag = self.scheduler_info.get("is_generation")
    if generation_flag is None:
        generation_flag = not args.is_embedding

    # Optional string settings fall back to "" when they are unset/falsy.
    response_fields = dict(
        model_path=args.model_path,
        tokenizer_path=args.tokenizer_path or "",
        is_generation=generation_flag,
        preferred_sampling_params=(args.preferred_sampling_params or ""),
        weight_version=args.weight_version or "",
        served_model_name=args.served_model_name,
        max_context_length=info["max_context_length"],
        vocab_size=info["vocab_size"],
        supports_vision=info["supports_vision"],
        model_type=info["model_type"],
        eos_token_ids=info["eos_token_ids"],
        pad_token_id=info["pad_token_id"],
        bos_token_id=info["bos_token_id"],
        max_req_input_len=info["max_req_input_len"],
    )
    return sglang_scheduler_pb2.GetModelInfoResponse(**response_fields)
async def GetServerInfo(
    self,
    request: sglang_scheduler_pb2.GetServerInfoRequest,
    context: grpc.aio.ServicerContext,
) -> sglang_scheduler_pb2.GetServerInfoResponse:
    """Get server information.

    Combines the server arguments, the scheduler info stored on the
    servicer, the request manager's runtime state and the servicer start
    time into a single response. Neither ``request`` nor ``context`` is read.

    Returns:
        A ``GetServerInfoResponse`` with ``server_args`` and
        ``scheduler_info`` encoded as protobuf ``Struct`` messages.

    Raises:
        KeyError: If the request manager's state lacks one of the keys
            read below.
    """
    logger.info("Server info request received")

    def make_serializable(obj):
        """Recursively convert ``obj`` into values a ``Struct`` accepts."""
        if obj is None:
            return None
        elif isinstance(obj, (str, int, float, bool)):
            return obj
        elif isinstance(obj, (list, tuple, set)):
            return [make_serializable(item) for item in obj]
        elif isinstance(obj, dict):
            # Struct field names must be strings, so stringify every key.
            return {str(k): make_serializable(v) for k, v in obj.items()}
        else:
            # Anything else (enums, paths, custom objects, ...) is sent as
            # its string form rather than failing the whole request.
            return str(obj)

    # Convert server_args to Struct
    server_args_dict = dataclasses.asdict(self.server_args)
    server_args_struct = Struct()
    server_args_struct.update(make_serializable(server_args_dict))

    # Convert scheduler_info to Struct. It is sanitised the same way as
    # server_args so that a value of an unsupported type cannot make
    # Struct.update() raise.
    scheduler_info_struct = Struct()
    scheduler_info_struct.update(make_serializable(self.scheduler_info))

    # Get runtime state from request manager
    manager_state = self.request_manager.get_server_info()

    # Calculate uptime
    uptime = time.time() - self.start_time

    # Create timestamp (truncated to whole seconds)
    start_timestamp = Timestamp()
    start_timestamp.FromSeconds(int(self.start_time))

    return sglang_scheduler_pb2.GetServerInfoResponse(
        server_args=server_args_struct,
        scheduler_info=scheduler_info_struct,
        active_requests=manager_state["active_requests"],
        is_paused=manager_state["paused"],
        last_receive_timestamp=manager_state["last_receive_time"],
        uptime_seconds=uptime,
        sglang_version=sglang.__version__,
        server_type="grpc",
        start_time=start_timestamp,
    )
# Helper methods for request/response conversion
# Helper methods for request/response conversion
def
_convert_generate_request
(
def
_convert_generate_request
(
...
@@ -756,6 +845,7 @@ async def serve_grpc(
...
@@ -756,6 +845,7 @@ async def serve_grpc(
request_manager
=
request_manager
,
request_manager
=
request_manager
,
server_args
=
server_args
,
server_args
=
server_args
,
model_info
=
model_info
,
model_info
=
model_info
,
scheduler_info
=
scheduler_info
,
)
)
sglang_scheduler_pb2_grpc
.
add_SglangSchedulerServicer_to_server
(
servicer
,
server
)
sglang_scheduler_pb2_grpc
.
add_SglangSchedulerServicer_to_server
(
servicer
,
server
)
...
...
python/sglang/srt/grpc/sglang_scheduler.proto
View file @
2fcd56ea
...
@@ -20,6 +20,12 @@ service SglangScheduler {
...
@@ -20,6 +20,12 @@ service SglangScheduler {
// Abort a running request
// Abort a running request
rpc
Abort
(
AbortRequest
)
returns
(
AbortResponse
);
rpc
Abort
(
AbortRequest
)
returns
(
AbortResponse
);
// Get model information
rpc
GetModelInfo
(
GetModelInfoRequest
)
returns
(
GetModelInfoResponse
);
// Get server information
rpc
GetServerInfo
(
GetServerInfoRequest
)
returns
(
GetServerInfoResponse
);
}
}
// =====================
// =====================
...
@@ -401,3 +407,56 @@ message SetInternalStateResponse {
...
@@ -401,3 +407,56 @@ message SetInternalStateResponse {
bool
success
=
1
;
bool
success
=
1
;
string
message
=
2
;
string
message
=
2
;
}
}
// =====================
// Model and Server Info
// =====================
// Get model information
// Request for the GetModelInfo RPC. Declares no fields.
message GetModelInfoRequest {}
// Response for the GetModelInfo RPC: model paths and names, followed by
// tokenizer and context-size metadata.
message GetModelInfoResponse {
  string model_path = 1;
  string tokenizer_path = 2;
  bool is_generation = 3;
  string preferred_sampling_params = 4;  // JSON string or empty
  string weight_version = 5;
  string served_model_name = 6;
  int32 max_context_length = 7;
  int32 vocab_size = 8;
  bool supports_vision = 9;
  string model_type = 10;
  repeated int32 eos_token_ids = 11;
  int32 pad_token_id = 12;
  int32 bos_token_id = 13;
  int32 max_req_input_len = 14;
}
// Get server information
// Request for the GetServerInfo RPC. Declares no fields.
message GetServerInfoRequest {}
// Response for the GetServerInfo RPC: server configuration, scheduler info,
// runtime state, version and server metadata.
message GetServerInfoResponse {
  // Server configuration (as structured data)
  google.protobuf.Struct server_args = 1;

  // Scheduler metrics (from scheduler initialization)
  google.protobuf.Struct scheduler_info = 2;

  // Runtime state
  int32 active_requests = 3;
  bool is_paused = 4;
  double last_receive_timestamp = 5;
  double uptime_seconds = 6;

  // Version info
  string sglang_version = 7;

  // Server metadata
  string server_type = 8;  // "grpc"
  google.protobuf.Timestamp start_time = 9;

  // Note: internal_states not provided in gRPC mode
  // Scheduler-side metrics (memory usage, throughput) require
  // bidirectional communicator infrastructure not available in gRPC.
  // Use HTTP /get_server_info if scheduler internal state is needed.
}
python/sglang/srt/grpc/sglang_scheduler_pb2.py
View file @
2fcd56ea
...
@@ -29,7 +29,7 @@ from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__
...
@@ -29,7 +29,7 @@ from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__
from
google.protobuf
import
struct_pb2
as
google_dot_protobuf_dot_struct__pb2
from
google.protobuf
import
struct_pb2
as
google_dot_protobuf_dot_struct__pb2
DESCRIPTOR
=
_descriptor_pool
.
Default
().
AddSerializedFile
(
b
'
\n\x16
sglang_scheduler.proto
\x12\x15
sglang.grpc.scheduler
\x1a\x1f
google/protobuf/timestamp.proto
\x1a\x1c
google/protobuf/struct.proto
\"\xd0\x05\n\x0e
SamplingParams
\x12\x13\n\x0b
temperature
\x18\x01
\x01
(
\x02\x12\r\n\x05
top_p
\x18\x02
\x01
(
\x02\x12\r\n\x05
top_k
\x18\x03
\x01
(
\x05\x12\r\n\x05
min_p
\x18\x04
\x01
(
\x02\x12\x19\n\x11\x66
requency_penalty
\x18\x05
\x01
(
\x02\x12\x18\n\x10
presence_penalty
\x18\x06
\x01
(
\x02\x12\x1a\n\x12
repetition_penalty
\x18\x07
\x01
(
\x02\x12\x1b\n\x0e
max_new_tokens
\x18\x08
\x01
(
\x05
H
\x01\x88\x01\x01\x12\x0c\n\x04
stop
\x18\t
\x03
(
\t\x12\x16\n\x0e
stop_token_ids
\x18\n
\x03
(
\r\x12\x1b\n\x13
skip_special_tokens
\x18\x0b
\x01
(
\x08\x12
%
\n\x1d
spaces_between_special_tokens
\x18\x0c
\x01
(
\x08\x12\x0f\n\x05
regex
\x18\r
\x01
(
\t
H
\x00\x12\x15\n\x0b
json_schema
\x18\x0e
\x01
(
\t
H
\x00\x12\x16\n\x0c\x65\x62
nf_grammar
\x18\x0f
\x01
(
\t
H
\x00\x12\x18\n\x0e
structural_tag
\x18\x10
\x01
(
\t
H
\x00\x12\t\n\x01
n
\x18\x11
\x01
(
\x05\x12\x16\n\x0e
min_new_tokens
\x18\x12
\x01
(
\x05\x12\x12\n\n
ignore_eos
\x18\x13
\x01
(
\x08\x12\x14\n\x0c
no_stop_trim
\x18\x14
\x01
(
\x08\x12\x1c\n\x0f
stream_interval
\x18\x15
\x01
(
\x05
H
\x02\x88\x01\x01\x12
H
\n\n
logit_bias
\x18\x16
\x03
(
\x0b\x32\x34
.sglang.grpc.scheduler.SamplingParams.LogitBiasEntry
\x12
.
\n\r
custom_params
\x18\x17
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x1a\x30\n\x0e
LogitBiasEntry
\x12\x0b\n\x03
key
\x18\x01
\x01
(
\t\x12\r\n\x05
value
\x18\x02
\x01
(
\x02
:
\x02\x38\x01\x42\x0c\n\n
constraintB
\x11\n\x0f
_max_new_tokensB
\x12\n\x10
_stream_interval
\"
]
\n\x13\x44
isaggregatedParams
\x12\x16\n\x0e\x62
ootstrap_host
\x18\x01
\x01
(
\t\x12\x16\n\x0e\x62
ootstrap_port
\x18\x02
\x01
(
\x05\x12\x16\n\x0e\x62
ootstrap_room
\x18\x03
\x01
(
\x05\"\xe2\x04\n\x0f
GenerateRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\t
tokenized
\x18\x02
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\x12
:
\n\t
mm_inputs
\x18\x03
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.MultimodalInputs
\x12
>
\n\x0f
sampling_params
\x18\x04
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.SamplingParams
\x12\x16\n\x0e
return_logprob
\x18\x05
\x01
(
\x08\x12\x19\n\x11
logprob_start_len
\x18\x06
\x01
(
\x05\x12\x18\n\x10
top_logprobs_num
\x18\x07
\x01
(
\x05\x12\x19\n\x11
token_ids_logprob
\x18\x08
\x03
(
\r\x12\x1c\n\x14
return_hidden_states
\x18\t
\x01
(
\x08\x12
H
\n\x14\x64
isaggregated_params
\x18\n
\x01
(
\x0b\x32
*.sglang.grpc.scheduler.DisaggregatedParams
\x12\x1e\n\x16\x63
ustom_logit_processor
\x18\x0b
\x01
(
\t\x12
-
\n\t
timestamp
\x18\x0c
\x01
(
\x0b\x32\x1a
.google.protobuf.Timestamp
\x12\x13\n\x0b
log_metrics
\x18\r
\x01
(
\x08\x12\x14\n\x0c
input_embeds
\x18\x0e
\x03
(
\x02\x12\x0f\n\x07
lora_id
\x18\x0f
\x01
(
\t\x12\x1a\n\x12\x64\x61
ta_parallel_rank
\x18\x10
\x01
(
\x05\x12\x0e\n\x06
stream
\x18\x11
\x01
(
\x08\"
:
\n\x0e
TokenizedInput
\x12\x15\n\r
original_text
\x18\x01
\x01
(
\t\x12\x11\n\t
input_ids
\x18\x02
\x03
(
\r\"\xd3\x01\n\x10
MultimodalInputs
\x12\x12\n\n
image_urls
\x18\x01
\x03
(
\t\x12\x12\n\n
video_urls
\x18\x02
\x03
(
\t\x12\x12\n\n
audio_urls
\x18\x03
\x03
(
\t\x12\x33\n\x12
processed_features
\x18\x04
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x12\x12\n\n
image_data
\x18\x05
\x03
(
\x0c\x12\x12\n\n
video_data
\x18\x06
\x03
(
\x0c\x12\x12\n\n
audio_data
\x18\x07
\x03
(
\x0c\x12\x12\n\n
modalities
\x18\x08
\x03
(
\t\"\xe3\x01\n\x10
GenerateResponse
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12
;
\n\x05\x63
hunk
\x18\x02
\x01
(
\x0b\x32
*.sglang.grpc.scheduler.GenerateStreamChunkH
\x00\x12
;
\n\x08\x63
omplete
\x18\x03
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.GenerateCompleteH
\x00\x12\x35\n\x05\x65
rror
\x18\x04
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.GenerateErrorH
\x00\x42\n\n\x08
response
\"\x95\x02\n\x13
GenerateStreamChunk
\x12\x11\n\t
token_ids
\x18\x01
\x03
(
\r\x12\x15\n\r
prompt_tokens
\x18\x02
\x01
(
\x05\x12\x19\n\x11\x63
ompletion_tokens
\x18\x03
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x04
\x01
(
\x05\x12
>
\n\x0f
output_logprobs
\x18\x05
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.OutputLogProbs
\x12\x15\n\r
hidden_states
\x18\x06
\x03
(
\x02\x12
<
\n\x0e
input_logprobs
\x18\x07
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.InputLogProbs
\x12\r\n\x05
index
\x18\x08
\x01
(
\r\"\x9b\x03\n\x10
GenerateComplete
\x12\x12\n\n
output_ids
\x18\x01
\x03
(
\r\x12\x15\n\r
finish_reason
\x18\x02
\x01
(
\t\x12\x15\n\r
prompt_tokens
\x18\x03
\x01
(
\x05\x12\x19\n\x11\x63
ompletion_tokens
\x18\x04
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x05
\x01
(
\x05\x12
>
\n\x0f
output_logprobs
\x18\x06
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.OutputLogProbs
\x12
>
\n\x11\x61
ll_hidden_states
\x18\x07
\x03
(
\x0b\x32
#.sglang.grpc.scheduler.HiddenStates
\x12\x1a\n\x10
matched_token_id
\x18\x08
\x01
(
\r
H
\x00\x12\x1a\n\x10
matched_stop_str
\x18\t
\x01
(
\t
H
\x00\x12
<
\n\x0e
input_logprobs
\x18\n
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.InputLogProbs
\x12\r\n\x05
index
\x18\x0b
\x01
(
\r
B
\x0e\n\x0c
matched_stop
\"
K
\n\r
GenerateError
\x12\x0f\n\x07
message
\x18\x01
\x01
(
\t\x12\x18\n\x10
http_status_code
\x18\x02
\x01
(
\t\x12\x0f\n\x07\x64\x65
tails
\x18\x03
\x01
(
\t\"
u
\n\x0e
OutputLogProbs
\x12\x16\n\x0e
token_logprobs
\x18\x01
\x03
(
\x02\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\x12\x38\n\x0c
top_logprobs
\x18\x03
\x03
(
\x0b\x32\"
.sglang.grpc.scheduler.TopLogProbs
\"\x9e\x01\n\r
InputLogProbs
\x12
@
\n\x0e
token_logprobs
\x18\x01
\x03
(
\x0b\x32
(.sglang.grpc.scheduler.InputTokenLogProb
\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\x12\x38\n\x0c
top_logprobs
\x18\x03
\x03
(
\x0b\x32\"
.sglang.grpc.scheduler.TopLogProbs
\"
1
\n\x11
InputTokenLogProb
\x12\x12\n\x05
value
\x18\x01
\x01
(
\x02
H
\x00\x88\x01\x01\x42\x08\n\x06
_value
\"
0
\n\x0b
TopLogProbs
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\"
?
\n\x0c
HiddenStates
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\r\n\x05
layer
\x18\x02
\x01
(
\x05\x12\x10\n\x08
position
\x18\x03
\x01
(
\x05\"\xca\x02\n\x0c\x45
mbedRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\t
tokenized
\x18\x02
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\x12
:
\n\t
mm_inputs
\x18\x04
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.MultimodalInputs
\x12
>
\n\x0f
sampling_params
\x18\x05
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.SamplingParams
\x12\x13\n\x0b
log_metrics
\x18\x06
\x01
(
\x08\x12\x16\n\x0e
token_type_ids
\x18\x07
\x03
(
\x05\x12\x1a\n\x12\x64\x61
ta_parallel_rank
\x18\x08
\x01
(
\x05\x12\x18\n\x10
is_cross_encoder
\x18\t
\x01
(
\x08\x12\r\n\x05
texts
\x18\n
\x03
(
\t\"\x9d\x01\n\r
EmbedResponse
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\x08\x63
omplete
\x18\x02
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.EmbedCompleteH
\x00\x12\x32\n\x05\x65
rror
\x18\x03
\x01
(
\x0b\x32
!.sglang.grpc.scheduler.EmbedErrorH
\x00\x42\n\n\x08
response
\"\xa3\x01\n\r
EmbedComplete
\x12\x11\n\t
embedding
\x18\x01
\x03
(
\x02\x12\x15\n\r
prompt_tokens
\x18\x02
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x03
\x01
(
\x05\x12\x15\n\r
embedding_dim
\x18\x04
\x01
(
\x05\x12
:
\n\x10\x62\x61
tch_embeddings
\x18\x05
\x03
(
\x0b\x32
.sglang.grpc.scheduler.Embedding
\"
*
\n\t
Embedding
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\r\n\x05
index
\x18\x02
\x01
(
\x05\"
<
\n\n
EmbedError
\x12\x0f\n\x07
message
\x18\x01
\x01
(
\t\x12\x0c\n\x04\x63
ode
\x18\x02
\x01
(
\t\x12\x0f\n\x07\x64\x65
tails
\x18\x03
\x01
(
\t\"
N
\n\x12
HealthCheckRequest
\x12\x38\n\t
tokenized
\x18\x01
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\"
7
\n\x13
HealthCheckResponse
\x12\x0f\n\x07
healthy
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
2
\n\x0c\x41\x62
ortRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x0e\n\x06
reason
\x18\x02
\x01
(
\t\"
1
\n\r
AbortResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
I
\n\x0f
LoadLoRARequest
\x12\x12\n\n
adapter_id
\x18\x01
\x01
(
\t\x12\x14\n\x0c\x61\x64\x61
pter_path
\x18\x02
\x01
(
\t\x12\x0c\n\x04
rank
\x18\x03
\x01
(
\x05\"
H
\n\x10
LoadLoRAResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x12\n\n
adapter_id
\x18\x02
\x01
(
\t\x12\x0f\n\x07
message
\x18\x03
\x01
(
\t\"\'\n\x11
UnloadLoRARequest
\x12\x12\n\n
adapter_id
\x18\x01
\x01
(
\t\"
6
\n\x12
UnloadLoRAResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
w
\n\x14
UpdateWeightsRequest
\x12\x13\n\t
disk_path
\x18\x01
\x01
(
\t
H
\x00\x12\x15\n\x0b
tensor_data
\x18\x02
\x01
(
\x0c
H
\x00\x12\x14\n\n
remote_url
\x18\x03
\x01
(
\t
H
\x00\x12\x13\n\x0b
weight_name
\x18\x04
\x01
(
\t
B
\x08\n\x06
source
\"
9
\n\x15
UpdateWeightsResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
-
\n\x17
GetInternalStateRequest
\x12\x12\n\n
state_keys
\x18\x01
\x03
(
\t\"
B
\n\x18
GetInternalStateResponse
\x12
&
\n\x05
state
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\"
A
\n\x17
SetInternalStateRequest
\x12
&
\n\x05
state
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\"
<
\n\x18
SetInternalStateResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t
2
\xfe\x02
\n\x0f
SglangScheduler
\x12
]
\n\x08
Generate
\x12
&.sglang.grpc.scheduler.GenerateRequest
\x1a\'
.sglang.grpc.scheduler.GenerateResponse0
\x01\x12
R
\n\x05\x45
mbed
\x12
#.sglang.grpc.scheduler.EmbedRequest
\x1a
$.sglang.grpc.scheduler.EmbedResponse
\x12\x64\n\x0b
HealthCheck
\x12
).sglang.grpc.scheduler.HealthCheckRequest
\x1a
*.sglang.grpc.scheduler.HealthCheckResponse
\x12
R
\n\x05\x41\x62
ort
\x12
#.sglang.grpc.scheduler.AbortRequest
\x1a
$.sglang.grpc.scheduler.AbortResponseb
\x06
proto3'
)
DESCRIPTOR
=
_descriptor_pool
.
Default
().
AddSerializedFile
(
b
'
\n\x16
sglang_scheduler.proto
\x12\x15
sglang.grpc.scheduler
\x1a\x1f
google/protobuf/timestamp.proto
\x1a\x1c
google/protobuf/struct.proto
\"\xd0\x05\n\x0e
SamplingParams
\x12\x13\n\x0b
temperature
\x18\x01
\x01
(
\x02\x12\r\n\x05
top_p
\x18\x02
\x01
(
\x02\x12\r\n\x05
top_k
\x18\x03
\x01
(
\x05\x12\r\n\x05
min_p
\x18\x04
\x01
(
\x02\x12\x19\n\x11\x66
requency_penalty
\x18\x05
\x01
(
\x02\x12\x18\n\x10
presence_penalty
\x18\x06
\x01
(
\x02\x12\x1a\n\x12
repetition_penalty
\x18\x07
\x01
(
\x02\x12\x1b\n\x0e
max_new_tokens
\x18\x08
\x01
(
\x05
H
\x01\x88\x01\x01\x12\x0c\n\x04
stop
\x18\t
\x03
(
\t\x12\x16\n\x0e
stop_token_ids
\x18\n
\x03
(
\r\x12\x1b\n\x13
skip_special_tokens
\x18\x0b
\x01
(
\x08\x12
%
\n\x1d
spaces_between_special_tokens
\x18\x0c
\x01
(
\x08\x12\x0f\n\x05
regex
\x18\r
\x01
(
\t
H
\x00\x12\x15\n\x0b
json_schema
\x18\x0e
\x01
(
\t
H
\x00\x12\x16\n\x0c\x65\x62
nf_grammar
\x18\x0f
\x01
(
\t
H
\x00\x12\x18\n\x0e
structural_tag
\x18\x10
\x01
(
\t
H
\x00\x12\t\n\x01
n
\x18\x11
\x01
(
\x05\x12\x16\n\x0e
min_new_tokens
\x18\x12
\x01
(
\x05\x12\x12\n\n
ignore_eos
\x18\x13
\x01
(
\x08\x12\x14\n\x0c
no_stop_trim
\x18\x14
\x01
(
\x08\x12\x1c\n\x0f
stream_interval
\x18\x15
\x01
(
\x05
H
\x02\x88\x01\x01\x12
H
\n\n
logit_bias
\x18\x16
\x03
(
\x0b\x32\x34
.sglang.grpc.scheduler.SamplingParams.LogitBiasEntry
\x12
.
\n\r
custom_params
\x18\x17
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x1a\x30\n\x0e
LogitBiasEntry
\x12\x0b\n\x03
key
\x18\x01
\x01
(
\t\x12\r\n\x05
value
\x18\x02
\x01
(
\x02
:
\x02\x38\x01\x42\x0c\n\n
constraintB
\x11\n\x0f
_max_new_tokensB
\x12\n\x10
_stream_interval
\"
]
\n\x13\x44
isaggregatedParams
\x12\x16\n\x0e\x62
ootstrap_host
\x18\x01
\x01
(
\t\x12\x16\n\x0e\x62
ootstrap_port
\x18\x02
\x01
(
\x05\x12\x16\n\x0e\x62
ootstrap_room
\x18\x03
\x01
(
\x05\"\xe2\x04\n\x0f
GenerateRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\t
tokenized
\x18\x02
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\x12
:
\n\t
mm_inputs
\x18\x03
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.MultimodalInputs
\x12
>
\n\x0f
sampling_params
\x18\x04
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.SamplingParams
\x12\x16\n\x0e
return_logprob
\x18\x05
\x01
(
\x08\x12\x19\n\x11
logprob_start_len
\x18\x06
\x01
(
\x05\x12\x18\n\x10
top_logprobs_num
\x18\x07
\x01
(
\x05\x12\x19\n\x11
token_ids_logprob
\x18\x08
\x03
(
\r\x12\x1c\n\x14
return_hidden_states
\x18\t
\x01
(
\x08\x12
H
\n\x14\x64
isaggregated_params
\x18\n
\x01
(
\x0b\x32
*.sglang.grpc.scheduler.DisaggregatedParams
\x12\x1e\n\x16\x63
ustom_logit_processor
\x18\x0b
\x01
(
\t\x12
-
\n\t
timestamp
\x18\x0c
\x01
(
\x0b\x32\x1a
.google.protobuf.Timestamp
\x12\x13\n\x0b
log_metrics
\x18\r
\x01
(
\x08\x12\x14\n\x0c
input_embeds
\x18\x0e
\x03
(
\x02\x12\x0f\n\x07
lora_id
\x18\x0f
\x01
(
\t\x12\x1a\n\x12\x64\x61
ta_parallel_rank
\x18\x10
\x01
(
\x05\x12\x0e\n\x06
stream
\x18\x11
\x01
(
\x08\"
:
\n\x0e
TokenizedInput
\x12\x15\n\r
original_text
\x18\x01
\x01
(
\t\x12\x11\n\t
input_ids
\x18\x02
\x03
(
\r\"\xd3\x01\n\x10
MultimodalInputs
\x12\x12\n\n
image_urls
\x18\x01
\x03
(
\t\x12\x12\n\n
video_urls
\x18\x02
\x03
(
\t\x12\x12\n\n
audio_urls
\x18\x03
\x03
(
\t\x12\x33\n\x12
processed_features
\x18\x04
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x12\x12\n\n
image_data
\x18\x05
\x03
(
\x0c\x12\x12\n\n
video_data
\x18\x06
\x03
(
\x0c\x12\x12\n\n
audio_data
\x18\x07
\x03
(
\x0c\x12\x12\n\n
modalities
\x18\x08
\x03
(
\t\"\xe3\x01\n\x10
GenerateResponse
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12
;
\n\x05\x63
hunk
\x18\x02
\x01
(
\x0b\x32
*.sglang.grpc.scheduler.GenerateStreamChunkH
\x00\x12
;
\n\x08\x63
omplete
\x18\x03
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.GenerateCompleteH
\x00\x12\x35\n\x05\x65
rror
\x18\x04
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.GenerateErrorH
\x00\x42\n\n\x08
response
\"\x95\x02\n\x13
GenerateStreamChunk
\x12\x11\n\t
token_ids
\x18\x01
\x03
(
\r\x12\x15\n\r
prompt_tokens
\x18\x02
\x01
(
\x05\x12\x19\n\x11\x63
ompletion_tokens
\x18\x03
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x04
\x01
(
\x05\x12
>
\n\x0f
output_logprobs
\x18\x05
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.OutputLogProbs
\x12\x15\n\r
hidden_states
\x18\x06
\x03
(
\x02\x12
<
\n\x0e
input_logprobs
\x18\x07
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.InputLogProbs
\x12\r\n\x05
index
\x18\x08
\x01
(
\r\"\x9b\x03\n\x10
GenerateComplete
\x12\x12\n\n
output_ids
\x18\x01
\x03
(
\r\x12\x15\n\r
finish_reason
\x18\x02
\x01
(
\t\x12\x15\n\r
prompt_tokens
\x18\x03
\x01
(
\x05\x12\x19\n\x11\x63
ompletion_tokens
\x18\x04
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x05
\x01
(
\x05\x12
>
\n\x0f
output_logprobs
\x18\x06
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.OutputLogProbs
\x12
>
\n\x11\x61
ll_hidden_states
\x18\x07
\x03
(
\x0b\x32
#.sglang.grpc.scheduler.HiddenStates
\x12\x1a\n\x10
matched_token_id
\x18\x08
\x01
(
\r
H
\x00\x12\x1a\n\x10
matched_stop_str
\x18\t
\x01
(
\t
H
\x00\x12
<
\n\x0e
input_logprobs
\x18\n
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.InputLogProbs
\x12\r\n\x05
index
\x18\x0b
\x01
(
\r
B
\x0e\n\x0c
matched_stop
\"
K
\n\r
GenerateError
\x12\x0f\n\x07
message
\x18\x01
\x01
(
\t\x12\x18\n\x10
http_status_code
\x18\x02
\x01
(
\t\x12\x0f\n\x07\x64\x65
tails
\x18\x03
\x01
(
\t\"
u
\n\x0e
OutputLogProbs
\x12\x16\n\x0e
token_logprobs
\x18\x01
\x03
(
\x02\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\x12\x38\n\x0c
top_logprobs
\x18\x03
\x03
(
\x0b\x32\"
.sglang.grpc.scheduler.TopLogProbs
\"\x9e\x01\n\r
InputLogProbs
\x12
@
\n\x0e
token_logprobs
\x18\x01
\x03
(
\x0b\x32
(.sglang.grpc.scheduler.InputTokenLogProb
\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\x12\x38\n\x0c
top_logprobs
\x18\x03
\x03
(
\x0b\x32\"
.sglang.grpc.scheduler.TopLogProbs
\"
1
\n\x11
InputTokenLogProb
\x12\x12\n\x05
value
\x18\x01
\x01
(
\x02
H
\x00\x88\x01\x01\x42\x08\n\x06
_value
\"
0
\n\x0b
TopLogProbs
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\"
?
\n\x0c
HiddenStates
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\r\n\x05
layer
\x18\x02
\x01
(
\x05\x12\x10\n\x08
position
\x18\x03
\x01
(
\x05\"\xca\x02\n\x0c\x45
mbedRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\t
tokenized
\x18\x02
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\x12
:
\n\t
mm_inputs
\x18\x04
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.MultimodalInputs
\x12
>
\n\x0f
sampling_params
\x18\x05
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.SamplingParams
\x12\x13\n\x0b
log_metrics
\x18\x06
\x01
(
\x08\x12\x16\n\x0e
token_type_ids
\x18\x07
\x03
(
\x05\x12\x1a\n\x12\x64\x61
ta_parallel_rank
\x18\x08
\x01
(
\x05\x12\x18\n\x10
is_cross_encoder
\x18\t
\x01
(
\x08\x12\r\n\x05
texts
\x18\n
\x03
(
\t\"\x9d\x01\n\r
EmbedResponse
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\x08\x63
omplete
\x18\x02
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.EmbedCompleteH
\x00\x12\x32\n\x05\x65
rror
\x18\x03
\x01
(
\x0b\x32
!.sglang.grpc.scheduler.EmbedErrorH
\x00\x42\n\n\x08
response
\"\xa3\x01\n\r
EmbedComplete
\x12\x11\n\t
embedding
\x18\x01
\x03
(
\x02\x12\x15\n\r
prompt_tokens
\x18\x02
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x03
\x01
(
\x05\x12\x15\n\r
embedding_dim
\x18\x04
\x01
(
\x05\x12
:
\n\x10\x62\x61
tch_embeddings
\x18\x05
\x03
(
\x0b\x32
.sglang.grpc.scheduler.Embedding
\"
*
\n\t
Embedding
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\r\n\x05
index
\x18\x02
\x01
(
\x05\"
<
\n\n
EmbedError
\x12\x0f\n\x07
message
\x18\x01
\x01
(
\t\x12\x0c\n\x04\x63
ode
\x18\x02
\x01
(
\t\x12\x0f\n\x07\x64\x65
tails
\x18\x03
\x01
(
\t\"
N
\n\x12
HealthCheckRequest
\x12\x38\n\t
tokenized
\x18\x01
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\"
7
\n\x13
HealthCheckResponse
\x12\x0f\n\x07
healthy
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
2
\n\x0c\x41\x62
ortRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x0e\n\x06
reason
\x18\x02
\x01
(
\t\"
1
\n\r
AbortResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
I
\n\x0f
LoadLoRARequest
\x12\x12\n\n
adapter_id
\x18\x01
\x01
(
\t\x12\x14\n\x0c\x61\x64\x61
pter_path
\x18\x02
\x01
(
\t\x12\x0c\n\x04
rank
\x18\x03
\x01
(
\x05\"
H
\n\x10
LoadLoRAResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x12\n\n
adapter_id
\x18\x02
\x01
(
\t\x12\x0f\n\x07
message
\x18\x03
\x01
(
\t\"\'\n\x11
UnloadLoRARequest
\x12\x12\n\n
adapter_id
\x18\x01
\x01
(
\t\"
6
\n\x12
UnloadLoRAResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
w
\n\x14
UpdateWeightsRequest
\x12\x13\n\t
disk_path
\x18\x01
\x01
(
\t
H
\x00\x12\x15\n\x0b
tensor_data
\x18\x02
\x01
(
\x0c
H
\x00\x12\x14\n\n
remote_url
\x18\x03
\x01
(
\t
H
\x00\x12\x13\n\x0b
weight_name
\x18\x04
\x01
(
\t
B
\x08\n\x06
source
\"
9
\n\x15
UpdateWeightsResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
-
\n\x17
GetInternalStateRequest
\x12\x12\n\n
state_keys
\x18\x01
\x03
(
\t\"
B
\n\x18
GetInternalStateResponse
\x12
&
\n\x05
state
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\"
A
\n\x17
SetInternalStateRequest
\x12
&
\n\x05
state
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\"
<
\n\x18
SetInternalStateResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t
\"\x15\n\x13
GetModelInfoRequest
\"\xea\x02\n\x14
GetModelInfoResponse
\x12\x12\n\n
model_path
\x18\x01
\x01
(
\t\x12\x16\n\x0e
tokenizer_path
\x18\x02
\x01
(
\t\x12\x15\n\r
is_generation
\x18\x03
\x01
(
\x08\x12
!
\n\x19
preferred_sampling_params
\x18\x04
\x01
(
\t\x12\x16\n\x0e
weight_version
\x18\x05
\x01
(
\t\x12\x19\n\x11
served_model_name
\x18\x06
\x01
(
\t\x12\x1a\n\x12
max_context_length
\x18\x07
\x01
(
\x05\x12\x12\n\n
vocab_size
\x18\x08
\x01
(
\x05\x12\x17\n\x0f
supports_vision
\x18\t
\x01
(
\x08\x12\x12\n\n
model_type
\x18\n
\x01
(
\t\x12\x15\n\r
eos_token_ids
\x18\x0b
\x03
(
\x05\x12\x14\n\x0c
pad_token_id
\x18\x0c
\x01
(
\x05\x12\x14\n\x0c\x62
os_token_id
\x18\r
\x01
(
\x05\x12\x19\n\x11
max_req_input_len
\x18\x0e
\x01
(
\x05\"\x16\n\x14
GetServerInfoRequest
\"\xb7\x02\n\x15
GetServerInfoResponse
\x12
,
\n\x0b
server_args
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x12
/
\n\x0e
scheduler_info
\x18\x02
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x12\x17\n\x0f\x61\x63
tive_requests
\x18\x03
\x01
(
\x05\x12\x11\n\t
is_paused
\x18\x04
\x01
(
\x08\x12\x1e\n\x16
last_receive_timestamp
\x18\x05
\x01
(
\x01\x12\x16\n\x0e
uptime_seconds
\x18\x06
\x01
(
\x01\x12\x16\n\x0e
sglang_version
\x18\x07
\x01
(
\t\x12\x13\n\x0b
server_type
\x18\x08
\x01
(
\t\x12
.
\n\n
start_time
\x18\t
\x01
(
\x0b\x32\x1a
.google.protobuf.Timestamp2
\xd3\x04
\n\x0f
SglangScheduler
\x12
]
\n\x08
Generate
\x12
&.sglang.grpc.scheduler.GenerateRequest
\x1a\'
.sglang.grpc.scheduler.GenerateResponse0
\x01\x12
R
\n\x05\x45
mbed
\x12
#.sglang.grpc.scheduler.EmbedRequest
\x1a
$.sglang.grpc.scheduler.EmbedResponse
\x12\x64\n\x0b
HealthCheck
\x12
).sglang.grpc.scheduler.HealthCheckRequest
\x1a
*.sglang.grpc.scheduler.HealthCheckResponse
\x12
R
\n\x05\x41\x62
ort
\x12
#.sglang.grpc.scheduler.AbortRequest
\x1a
$.sglang.grpc.scheduler.AbortResponse
\x12
g
\n\x0c
GetModelInfo
\x12
*.sglang.grpc.scheduler.GetModelInfoRequest
\x1a
+.sglang.grpc.scheduler.GetModelInfoResponse
\x12
j
\n\r
GetServerInfo
\x12
+.sglang.grpc.scheduler.GetServerInfoRequest
\x1a
,.sglang.grpc.scheduler.GetServerInfoResponse
b
\x06
proto3'
)
_globals
=
globals
()
_globals
=
globals
()
_builder
.
BuildMessageAndEnumDescriptors
(
DESCRIPTOR
,
_globals
)
_builder
.
BuildMessageAndEnumDescriptors
(
DESCRIPTOR
,
_globals
)
...
@@ -106,6 +106,14 @@ if not _descriptor._USE_C_DESCRIPTORS:
...
@@ -106,6 +106,14 @@ if not _descriptor._USE_C_DESCRIPTORS:
_globals
[
'_SETINTERNALSTATEREQUEST'
].
_serialized_end
=
4875
_globals
[
'_SETINTERNALSTATEREQUEST'
].
_serialized_end
=
4875
_globals
[
'_SETINTERNALSTATERESPONSE'
].
_serialized_start
=
4877
_globals
[
'_SETINTERNALSTATERESPONSE'
].
_serialized_start
=
4877
_globals
[
'_SETINTERNALSTATERESPONSE'
].
_serialized_end
=
4937
_globals
[
'_SETINTERNALSTATERESPONSE'
].
_serialized_end
=
4937
_globals
[
'_SGLANGSCHEDULER'
].
_serialized_start
=
4940
_globals
[
'_GETMODELINFOREQUEST'
].
_serialized_start
=
4939
_globals
[
'_SGLANGSCHEDULER'
].
_serialized_end
=
5322
_globals
[
'_GETMODELINFOREQUEST'
].
_serialized_end
=
4960
_globals
[
'_GETMODELINFORESPONSE'
].
_serialized_start
=
4963
_globals
[
'_GETMODELINFORESPONSE'
].
_serialized_end
=
5325
_globals
[
'_GETSERVERINFOREQUEST'
].
_serialized_start
=
5327
_globals
[
'_GETSERVERINFOREQUEST'
].
_serialized_end
=
5349
_globals
[
'_GETSERVERINFORESPONSE'
].
_serialized_start
=
5352
_globals
[
'_GETSERVERINFORESPONSE'
].
_serialized_end
=
5663
_globals
[
'_SGLANGSCHEDULER'
].
_serialized_start
=
5666
_globals
[
'_SGLANGSCHEDULER'
].
_serialized_end
=
6261
# @@protoc_insertion_point(module_scope)
# @@protoc_insertion_point(module_scope)
python/sglang/srt/grpc/sglang_scheduler_pb2.pyi
View file @
2fcd56ea
...
@@ -428,3 +428,65 @@ class SetInternalStateResponse(_message.Message):
...
@@ -428,3 +428,65 @@ class SetInternalStateResponse(_message.Message):
success: bool
success: bool
message: str
message: str
def __init__(self, success: bool = ..., message: _Optional[str] = ...) -> None: ...
def __init__(self, success: bool = ..., message: _Optional[str] = ...) -> None: ...
class GetModelInfoRequest(_message.Message):
__slots__ = ()
def __init__(self) -> None: ...
class GetModelInfoResponse(_message.Message):
__slots__ = ("model_path", "tokenizer_path", "is_generation", "preferred_sampling_params", "weight_version", "served_model_name", "max_context_length", "vocab_size", "supports_vision", "model_type", "eos_token_ids", "pad_token_id", "bos_token_id", "max_req_input_len")
MODEL_PATH_FIELD_NUMBER: _ClassVar[int]
TOKENIZER_PATH_FIELD_NUMBER: _ClassVar[int]
IS_GENERATION_FIELD_NUMBER: _ClassVar[int]
PREFERRED_SAMPLING_PARAMS_FIELD_NUMBER: _ClassVar[int]
WEIGHT_VERSION_FIELD_NUMBER: _ClassVar[int]
SERVED_MODEL_NAME_FIELD_NUMBER: _ClassVar[int]
MAX_CONTEXT_LENGTH_FIELD_NUMBER: _ClassVar[int]
VOCAB_SIZE_FIELD_NUMBER: _ClassVar[int]
SUPPORTS_VISION_FIELD_NUMBER: _ClassVar[int]
MODEL_TYPE_FIELD_NUMBER: _ClassVar[int]
EOS_TOKEN_IDS_FIELD_NUMBER: _ClassVar[int]
PAD_TOKEN_ID_FIELD_NUMBER: _ClassVar[int]
BOS_TOKEN_ID_FIELD_NUMBER: _ClassVar[int]
MAX_REQ_INPUT_LEN_FIELD_NUMBER: _ClassVar[int]
model_path: str
tokenizer_path: str
is_generation: bool
preferred_sampling_params: str
weight_version: str
served_model_name: str
max_context_length: int
vocab_size: int
supports_vision: bool
model_type: str
eos_token_ids: _containers.RepeatedScalarFieldContainer[int]
pad_token_id: int
bos_token_id: int
max_req_input_len: int
def __init__(self, model_path: _Optional[str] = ..., tokenizer_path: _Optional[str] = ..., is_generation: bool = ..., preferred_sampling_params: _Optional[str] = ..., weight_version: _Optional[str] = ..., served_model_name: _Optional[str] = ..., max_context_length: _Optional[int] = ..., vocab_size: _Optional[int] = ..., supports_vision: bool = ..., model_type: _Optional[str] = ..., eos_token_ids: _Optional[_Iterable[int]] = ..., pad_token_id: _Optional[int] = ..., bos_token_id: _Optional[int] = ..., max_req_input_len: _Optional[int] = ...) -> None: ...
class GetServerInfoRequest(_message.Message):
__slots__ = ()
def __init__(self) -> None: ...
class GetServerInfoResponse(_message.Message):
__slots__ = ("server_args", "scheduler_info", "active_requests", "is_paused", "last_receive_timestamp", "uptime_seconds", "sglang_version", "server_type", "start_time")
SERVER_ARGS_FIELD_NUMBER: _ClassVar[int]
SCHEDULER_INFO_FIELD_NUMBER: _ClassVar[int]
ACTIVE_REQUESTS_FIELD_NUMBER: _ClassVar[int]
IS_PAUSED_FIELD_NUMBER: _ClassVar[int]
LAST_RECEIVE_TIMESTAMP_FIELD_NUMBER: _ClassVar[int]
UPTIME_SECONDS_FIELD_NUMBER: _ClassVar[int]
SGLANG_VERSION_FIELD_NUMBER: _ClassVar[int]
SERVER_TYPE_FIELD_NUMBER: _ClassVar[int]
START_TIME_FIELD_NUMBER: _ClassVar[int]
server_args: _struct_pb2.Struct
scheduler_info: _struct_pb2.Struct
active_requests: int
is_paused: bool
last_receive_timestamp: float
uptime_seconds: float
sglang_version: str
server_type: str
start_time: _timestamp_pb2.Timestamp
def __init__(self, server_args: _Optional[_Union[_struct_pb2.Struct, _Mapping]] = ..., scheduler_info: _Optional[_Union[_struct_pb2.Struct, _Mapping]] = ..., active_requests: _Optional[int] = ..., is_paused: bool = ..., last_receive_timestamp: _Optional[float] = ..., uptime_seconds: _Optional[float] = ..., sglang_version: _Optional[str] = ..., server_type: _Optional[str] = ..., start_time: _Optional[_Union[datetime.datetime, _timestamp_pb2.Timestamp, _Mapping]] = ...) -> None: ...
python/sglang/srt/grpc/sglang_scheduler_pb2_grpc.py
View file @
2fcd56ea
...
@@ -59,6 +59,16 @@ class SglangSchedulerStub(object):
...
@@ -59,6 +59,16 @@ class SglangSchedulerStub(object):
request_serializer
=
sglang__scheduler__pb2
.
AbortRequest
.
SerializeToString
,
request_serializer
=
sglang__scheduler__pb2
.
AbortRequest
.
SerializeToString
,
response_deserializer
=
sglang__scheduler__pb2
.
AbortResponse
.
FromString
,
response_deserializer
=
sglang__scheduler__pb2
.
AbortResponse
.
FromString
,
_registered_method
=
True
)
_registered_method
=
True
)
self
.
GetModelInfo
=
channel
.
unary_unary
(
'/sglang.grpc.scheduler.SglangScheduler/GetModelInfo'
,
request_serializer
=
sglang__scheduler__pb2
.
GetModelInfoRequest
.
SerializeToString
,
response_deserializer
=
sglang__scheduler__pb2
.
GetModelInfoResponse
.
FromString
,
_registered_method
=
True
)
self
.
GetServerInfo
=
channel
.
unary_unary
(
'/sglang.grpc.scheduler.SglangScheduler/GetServerInfo'
,
request_serializer
=
sglang__scheduler__pb2
.
GetServerInfoRequest
.
SerializeToString
,
response_deserializer
=
sglang__scheduler__pb2
.
GetServerInfoResponse
.
FromString
,
_registered_method
=
True
)
class
SglangSchedulerServicer
(
object
):
class
SglangSchedulerServicer
(
object
):
...
@@ -94,6 +104,20 @@ class SglangSchedulerServicer(object):
...
@@ -94,6 +104,20 @@ class SglangSchedulerServicer(object):
context
.
set_details
(
'Method not implemented!'
)
context
.
set_details
(
'Method not implemented!'
)
raise
NotImplementedError
(
'Method not implemented!'
)
raise
NotImplementedError
(
'Method not implemented!'
)
def
GetModelInfo
(
self
,
request
,
context
):
"""Get model information
"""
context
.
set_code
(
grpc
.
StatusCode
.
UNIMPLEMENTED
)
context
.
set_details
(
'Method not implemented!'
)
raise
NotImplementedError
(
'Method not implemented!'
)
def
GetServerInfo
(
self
,
request
,
context
):
"""Get server information
"""
context
.
set_code
(
grpc
.
StatusCode
.
UNIMPLEMENTED
)
context
.
set_details
(
'Method not implemented!'
)
raise
NotImplementedError
(
'Method not implemented!'
)
def
add_SglangSchedulerServicer_to_server
(
servicer
,
server
):
def
add_SglangSchedulerServicer_to_server
(
servicer
,
server
):
rpc_method_handlers
=
{
rpc_method_handlers
=
{
...
@@ -117,6 +141,16 @@ def add_SglangSchedulerServicer_to_server(servicer, server):
...
@@ -117,6 +141,16 @@ def add_SglangSchedulerServicer_to_server(servicer, server):
request_deserializer
=
sglang__scheduler__pb2
.
AbortRequest
.
FromString
,
request_deserializer
=
sglang__scheduler__pb2
.
AbortRequest
.
FromString
,
response_serializer
=
sglang__scheduler__pb2
.
AbortResponse
.
SerializeToString
,
response_serializer
=
sglang__scheduler__pb2
.
AbortResponse
.
SerializeToString
,
),
),
'GetModelInfo'
:
grpc
.
unary_unary_rpc_method_handler
(
servicer
.
GetModelInfo
,
request_deserializer
=
sglang__scheduler__pb2
.
GetModelInfoRequest
.
FromString
,
response_serializer
=
sglang__scheduler__pb2
.
GetModelInfoResponse
.
SerializeToString
,
),
'GetServerInfo'
:
grpc
.
unary_unary_rpc_method_handler
(
servicer
.
GetServerInfo
,
request_deserializer
=
sglang__scheduler__pb2
.
GetServerInfoRequest
.
FromString
,
response_serializer
=
sglang__scheduler__pb2
.
GetServerInfoResponse
.
SerializeToString
,
),
}
}
generic_handler
=
grpc
.
method_handlers_generic_handler
(
generic_handler
=
grpc
.
method_handlers_generic_handler
(
'sglang.grpc.scheduler.SglangScheduler'
,
rpc_method_handlers
)
'sglang.grpc.scheduler.SglangScheduler'
,
rpc_method_handlers
)
...
@@ -237,3 +271,57 @@ class SglangScheduler(object):
...
@@ -237,3 +271,57 @@ class SglangScheduler(object):
timeout
,
timeout
,
metadata
,
metadata
,
_registered_method
=
True
)
_registered_method
=
True
)
@
staticmethod
def
GetModelInfo
(
request
,
target
,
options
=
(),
channel_credentials
=
None
,
call_credentials
=
None
,
insecure
=
False
,
compression
=
None
,
wait_for_ready
=
None
,
timeout
=
None
,
metadata
=
None
):
return
grpc
.
experimental
.
unary_unary
(
request
,
target
,
'/sglang.grpc.scheduler.SglangScheduler/GetModelInfo'
,
sglang__scheduler__pb2
.
GetModelInfoRequest
.
SerializeToString
,
sglang__scheduler__pb2
.
GetModelInfoResponse
.
FromString
,
options
,
channel_credentials
,
insecure
,
call_credentials
,
compression
,
wait_for_ready
,
timeout
,
metadata
,
_registered_method
=
True
)
@
staticmethod
def
GetServerInfo
(
request
,
target
,
options
=
(),
channel_credentials
=
None
,
call_credentials
=
None
,
insecure
=
False
,
compression
=
None
,
wait_for_ready
=
None
,
timeout
=
None
,
metadata
=
None
):
return
grpc
.
experimental
.
unary_unary
(
request
,
target
,
'/sglang.grpc.scheduler.SglangScheduler/GetServerInfo'
,
sglang__scheduler__pb2
.
GetServerInfoRequest
.
SerializeToString
,
sglang__scheduler__pb2
.
GetServerInfoResponse
.
FromString
,
options
,
channel_credentials
,
insecure
,
call_credentials
,
compression
,
wait_for_ready
,
timeout
,
metadata
,
_registered_method
=
True
)
sgl-router/src/grpc_client/sglang_scheduler.rs
View file @
2fcd56ea
...
@@ -97,6 +97,30 @@ impl SglangSchedulerClient {
...
@@ -97,6 +97,30 @@ impl SglangSchedulerClient {
Ok
(())
Ok
(())
}
}
/// Get model information
pub
async
fn
get_model_info
(
&
mut
self
,
)
->
Result
<
proto
::
GetModelInfoResponse
,
Box
<
dyn
std
::
error
::
Error
+
Send
+
Sync
>>
{
debug!
(
"Requesting model info"
);
let
request
=
Request
::
new
(
proto
::
GetModelInfoRequest
{});
let
response
=
self
.client
.get_model_info
(
request
)
.await
?
;
debug!
(
"Model info response received"
);
Ok
(
response
.into_inner
())
}
/// Get server information
pub
async
fn
get_server_info
(
&
mut
self
,
)
->
Result
<
proto
::
GetServerInfoResponse
,
Box
<
dyn
std
::
error
::
Error
+
Send
+
Sync
>>
{
debug!
(
"Requesting server info"
);
let
request
=
Request
::
new
(
proto
::
GetServerInfoRequest
{});
let
response
=
self
.client
.get_server_info
(
request
)
.await
?
;
debug!
(
"Server info response received"
);
Ok
(
response
.into_inner
())
}
/// Build a single SGLang GenerateRequest from OpenAI ChatCompletionRequest
/// Build a single SGLang GenerateRequest from OpenAI ChatCompletionRequest
pub
fn
build_generate_request
(
pub
fn
build_generate_request
(
&
self
,
&
self
,
...
...
sgl-router/src/proto/sglang_scheduler.proto
View file @
2fcd56ea
...
@@ -20,6 +20,12 @@ service SglangScheduler {
...
@@ -20,6 +20,12 @@ service SglangScheduler {
// Abort a running request
// Abort a running request
rpc
Abort
(
AbortRequest
)
returns
(
AbortResponse
);
rpc
Abort
(
AbortRequest
)
returns
(
AbortResponse
);
// Get model information
rpc
GetModelInfo
(
GetModelInfoRequest
)
returns
(
GetModelInfoResponse
);
// Get server information
rpc
GetServerInfo
(
GetServerInfoRequest
)
returns
(
GetServerInfoResponse
);
}
}
// =====================
// =====================
...
@@ -401,3 +407,56 @@ message SetInternalStateResponse {
...
@@ -401,3 +407,56 @@ message SetInternalStateResponse {
bool
success
=
1
;
bool
success
=
1
;
string
message
=
2
;
string
message
=
2
;
}
}
// =====================
// Model and Server Info
// =====================
// Get model information
message
GetModelInfoRequest
{}
message
GetModelInfoResponse
{
string
model_path
=
1
;
string
tokenizer_path
=
2
;
bool
is_generation
=
3
;
string
preferred_sampling_params
=
4
;
// JSON string or empty
string
weight_version
=
5
;
string
served_model_name
=
6
;
int32
max_context_length
=
7
;
int32
vocab_size
=
8
;
bool
supports_vision
=
9
;
string
model_type
=
10
;
repeated
int32
eos_token_ids
=
11
;
int32
pad_token_id
=
12
;
int32
bos_token_id
=
13
;
int32
max_req_input_len
=
14
;
}
// Get server information
message
GetServerInfoRequest
{}
message
GetServerInfoResponse
{
// Server configuration (as structured data)
google.protobuf.Struct
server_args
=
1
;
// Scheduler metrics (from scheduler initialization)
google.protobuf.Struct
scheduler_info
=
2
;
// Runtime state
int32
active_requests
=
3
;
bool
is_paused
=
4
;
double
last_receive_timestamp
=
5
;
double
uptime_seconds
=
6
;
// Version info
string
sglang_version
=
7
;
// Server metadata
string
server_type
=
8
;
// "grpc"
google.protobuf.Timestamp
start_time
=
9
;
// Note: internal_states not provided in gRPC mode
// Scheduler-side metrics (memory usage, throughput) require
// bidirectional communicator infrastructure not available in gRPC.
// Use HTTP /get_server_info if scheduler internal state is needed.
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment