Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
91678474
Unverified
Commit
91678474
authored
Sep 25, 2025
by
Chang Su
Committed by
GitHub
Sep 25, 2025
Browse files
router: Fix constraint proto and `build_constraint` in grpc router (#10881)
parent
d511b2d9
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
172 additions
and
141 deletions
+172
-141
python/sglang/srt/entrypoints/grpc_server.py
python/sglang/srt/entrypoints/grpc_server.py
+4
-0
python/sglang/srt/grpc/sglang_scheduler.proto
python/sglang/srt/grpc/sglang_scheduler.proto
+9
-9
python/sglang/srt/grpc/sglang_scheduler_pb2.py
python/sglang/srt/grpc/sglang_scheduler_pb2.py
+71
-68
python/sglang/srt/grpc/sglang_scheduler_pb2.pyi
python/sglang/srt/grpc/sglang_scheduler_pb2.pyi
+4
-4
python/sglang/srt/grpc/sglang_scheduler_pb2_grpc.py
python/sglang/srt/grpc/sglang_scheduler_pb2_grpc.py
+3
-0
sgl-router/src/proto/sglang_scheduler.proto
sgl-router/src/proto/sglang_scheduler.proto
+9
-9
sgl-router/src/routers/grpc/router.rs
sgl-router/src/routers/grpc/router.rs
+72
-51
No files found.
python/sglang/srt/entrypoints/grpc_server.py
View file @
91678474
...
@@ -438,6 +438,7 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
...
@@ -438,6 +438,7 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
regex
=
None
regex
=
None
json_schema
=
None
json_schema
=
None
ebnf_grammar
=
None
ebnf_grammar
=
None
structural_tag
=
None
if
grpc_params
.
HasField
(
"regex"
):
if
grpc_params
.
HasField
(
"regex"
):
regex
=
grpc_params
.
regex
regex
=
grpc_params
.
regex
...
@@ -445,6 +446,8 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
...
@@ -445,6 +446,8 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
json_schema
=
grpc_params
.
json_schema
json_schema
=
grpc_params
.
json_schema
elif
grpc_params
.
HasField
(
"ebnf_grammar"
):
elif
grpc_params
.
HasField
(
"ebnf_grammar"
):
ebnf_grammar
=
grpc_params
.
ebnf_grammar
ebnf_grammar
=
grpc_params
.
ebnf_grammar
elif
grpc_params
.
HasField
(
"structural_tag"
):
structural_tag
=
grpc_params
.
structural_tag
return
SGLSamplingParams
(
return
SGLSamplingParams
(
temperature
=
grpc_params
.
temperature
or
1.0
,
temperature
=
grpc_params
.
temperature
or
1.0
,
...
@@ -465,6 +468,7 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
...
@@ -465,6 +468,7 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
regex
=
regex
,
regex
=
regex
,
json_schema
=
json_schema
,
json_schema
=
json_schema
,
ebnf
=
ebnf_grammar
,
ebnf
=
ebnf_grammar
,
structural_tag
=
structural_tag
,
n
=
grpc_params
.
n
or
1
,
n
=
grpc_params
.
n
or
1
,
ignore_eos
=
grpc_params
.
ignore_eos
,
ignore_eos
=
grpc_params
.
ignore_eos
,
)
)
...
...
python/sglang/srt/grpc/sglang_scheduler.proto
View file @
91678474
...
@@ -47,24 +47,24 @@ message SamplingParams {
...
@@ -47,24 +47,24 @@ message SamplingParams {
string
regex
=
13
;
string
regex
=
13
;
string
json_schema
=
14
;
string
json_schema
=
14
;
string
ebnf_grammar
=
15
;
string
ebnf_grammar
=
15
;
string
structural_tag
=
16
;
}
}
// LoRA adapter
// LoRA adapter
string
lora_path
=
1
6
;
string
lora_path
=
1
7
;
// Speculative decoding
// Speculative decoding
int32
n
=
1
7
;
// Number of samples
int32
n
=
1
8
;
// Number of samples
// Token healing
// Token healing
bool
token_healing
=
1
8
;
bool
token_healing
=
1
9
;
// Additional parameters
// Additional parameters
int32
min_new_tokens
=
19
;
int32
min_new_tokens
=
20
;
bool
ignore_eos
=
20
;
bool
ignore_eos
=
21
;
bool
no_stop_trim
=
21
;
bool
no_stop_trim
=
22
;
int32
stream_interval
=
22
;
int32
stream_interval
=
23
;
map
<
string
,
float
>
logit_bias
=
23
;
map
<
string
,
float
>
logit_bias
=
24
;
string
structural_tag
=
24
;
// Custom parameters for extensibility
// Custom parameters for extensibility
google.protobuf.Struct
custom_params
=
25
;
google.protobuf.Struct
custom_params
=
25
;
...
...
python/sglang/srt/grpc/sglang_scheduler_pb2.py
View file @
91678474
# This file is auto-generated. Do not edit manually.
# Regenerate with: python compile_proto.py
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# Generated by the protocol buffer compiler. DO NOT EDIT!
# NO CHECKED-IN PROTOBUF GENCODE
# NO CHECKED-IN PROTOBUF GENCODE
...
@@ -26,7 +29,7 @@ from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__
...
@@ -26,7 +29,7 @@ from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__
from
google.protobuf
import
struct_pb2
as
google_dot_protobuf_dot_struct__pb2
from
google.protobuf
import
struct_pb2
as
google_dot_protobuf_dot_struct__pb2
DESCRIPTOR
=
_descriptor_pool
.
Default
().
AddSerializedFile
(
b
'
\n\x16
sglang_scheduler.proto
\x12\x15
sglang.grpc.scheduler
\x1a\x1f
google/protobuf/timestamp.proto
\x1a\x1c
google/protobuf/struct.proto
\"\xc
7
\x05\n\x0e
SamplingParams
\x12\x13\n\x0b
temperature
\x18\x01
\x01
(
\x02\x12\r\n\x05
top_p
\x18\x02
\x01
(
\x02\x12\r\n\x05
top_k
\x18\x03
\x01
(
\x05\x12\r\n\x05
min_p
\x18\x04
\x01
(
\x02\x12\x19\n\x11\x66
requency_penalty
\x18\x05
\x01
(
\x02\x12\x18\n\x10
presence_penalty
\x18\x06
\x01
(
\x02\x12\x1a\n\x12
repetition_penalty
\x18\x07
\x01
(
\x02\x12\x16\n\x0e
max_new_tokens
\x18\x08
\x01
(
\x05\x12\x0c\n\x04
stop
\x18\t
\x03
(
\t\x12\x16\n\x0e
stop_token_ids
\x18\n
\x03
(
\x05\x12\x1b\n\x13
skip_special_tokens
\x18\x0b
\x01
(
\x08\x12
%
\n\x1d
spaces_between_special_tokens
\x18\x0c
\x01
(
\x08\x12\x0f\n\x05
regex
\x18\r
\x01
(
\t
H
\x00\x12\x15\n\x0b
json_schema
\x18\x0e
\x01
(
\t
H
\x00\x12\x16\n\x0c\x65\x62
nf_grammar
\x18\x0f
\x01
(
\t
H
\x00\x12\x11\n\t
lora_path
\x18\x1
0
\x01
(
\t\x12\t\n\x01
n
\x18\x1
1
\x01
(
\x05\x12\x15\n\r
token_healing
\x18\x1
2
\x01
(
\x08\x12\x16\n\x0e
min_new_tokens
\x18\x1
3
\x01
(
\x05\x12\x12\n\n
ignore_eos
\x18\x1
4
\x01
(
\x08\x12\x14\n\x0c
no_stop_trim
\x18\x1
5
\x01
(
\x08\x12\x17\n\x0f
stream_interval
\x18\x1
6
\x01
(
\x05\x12
H
\n\n
logit_bias
\x18\x1
7
\x03
(
\x0b\x32\x34
.sglang.grpc.scheduler.SamplingParams.LogitBiasEntry
\x12
\x16\n\x0e
structural_tag
\x18\x18
\x01
(
\t\x12
.
\n\r
custom_params
\x18\x19
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x1a\x30\n\x0e
LogitBiasEntry
\x12\x0b\n\x03
key
\x18\x01
\x01
(
\t\x12\r\n\x05
value
\x18\x02
\x01
(
\x02
:
\x02\x38\x01\x42\x0c\n\n
constraint
\"
]
\n\x13\x44
isaggregatedParams
\x12\x16\n\x0e\x62
ootstrap_host
\x18\x01
\x01
(
\t\x12\x16\n\x0e\x62
ootstrap_port
\x18\x02
\x01
(
\x05\x12\x16\n\x0e\x62
ootstrap_room
\x18\x03
\x01
(
\x05\"\xe9\x04\n\x0f
GenerateRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\t
tokenized
\x18\x02
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\x12
:
\n\t
mm_inputs
\x18\x03
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.MultimodalInputs
\x12
>
\n\x0f
sampling_params
\x18\x04
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.SamplingParams
\x12\x16\n\x0e
return_logprob
\x18\x05
\x01
(
\x08\x12\x19\n\x11
logprob_start_len
\x18\x06
\x01
(
\x05\x12\x18\n\x10
top_logprobs_num
\x18\x07
\x01
(
\x05\x12\x19\n\x11
token_ids_logprob
\x18\x08
\x03
(
\x05\x12\x1c\n\x14
return_hidden_states
\x18\t
\x01
(
\x08\x12
H
\n\x14\x64
isaggregated_params
\x18\n
\x01
(
\x0b\x32
*.sglang.grpc.scheduler.DisaggregatedParams
\x12\x1e\n\x16\x63
ustom_logit_processor
\x18\x0b
\x01
(
\t\x12
-
\n\t
timestamp
\x18\x0c
\x01
(
\x0b\x32\x1a
.google.protobuf.Timestamp
\x12\x13\n\x0b
log_metrics
\x18\r
\x01
(
\x08\x12\x14\n\x0c
input_embeds
\x18\x0e
\x03
(
\x02\x12\x0f\n\x07
lora_id
\x18\x0f
\x01
(
\t\x12\x1a\n\x12\x64\x61
ta_parallel_rank
\x18\x10
\x01
(
\x05\x12\x15\n\r
dp_balance_id
\x18\x11
\x01
(
\x05\"
:
\n\x0e
TokenizedInput
\x12\x15\n\r
original_text
\x18\x01
\x01
(
\t\x12\x11\n\t
input_ids
\x18\x02
\x03
(
\x05\"\xd3\x01\n\x10
MultimodalInputs
\x12\x12\n\n
image_urls
\x18\x01
\x03
(
\t\x12\x12\n\n
video_urls
\x18\x02
\x03
(
\t\x12\x12\n\n
audio_urls
\x18\x03
\x03
(
\t\x12\x33\n\x12
processed_features
\x18\x04
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x12\x12\n\n
image_data
\x18\x05
\x03
(
\x0c\x12\x12\n\n
video_data
\x18\x06
\x03
(
\x0c\x12\x12\n\n
audio_data
\x18\x07
\x03
(
\x0c\x12\x12\n\n
modalities
\x18\x08
\x03
(
\t\"\xe3\x01\n\x10
GenerateResponse
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12
;
\n\x05\x63
hunk
\x18\x02
\x01
(
\x0b\x32
*.sglang.grpc.scheduler.GenerateStreamChunkH
\x00\x12
;
\n\x08\x63
omplete
\x18\x03
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.GenerateCompleteH
\x00\x12\x35\n\x05\x65
rror
\x18\x04
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.GenerateErrorH
\x00\x42\n\n\x08
response
\"\xf5\x01\n\x13
GenerateStreamChunk
\x12\x10\n\x08
token_id
\x18\x01
\x01
(
\x05\x12\x0c\n\x04
text
\x18\x02
\x01
(
\t\x12\x15\n\r
prompt_tokens
\x18\x03
\x01
(
\x05\x12\x19\n\x11\x63
ompletion_tokens
\x18\x04
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x05
\x01
(
\x05\x12\x31\n\x08
logprobs
\x18\x06
\x01
(
\x0b\x32\x1f
.sglang.grpc.scheduler.LogProbs
\x12\x15\n\r
hidden_states
\x18\x07
\x03
(
\x02\x12\x17\n\x0f
generation_time
\x18\x08
\x01
(
\x02\x12\x12\n\n
queue_time
\x18\t
\x01
(
\x05\"\xcd\x02\n\x10
GenerateComplete
\x12\x12\n\n
output_ids
\x18\x01
\x03
(
\x05\x12\x13\n\x0b
output_text
\x18\x02
\x01
(
\t\x12
K
\n\r
finish_reason
\x18\x03
\x01
(
\x0e\x32\x34
.sglang.grpc.scheduler.GenerateComplete.FinishReason
\x12\x35\n\x0c\x61
ll_logprobs
\x18\x0b
\x03
(
\x0b\x32\x1f
.sglang.grpc.scheduler.LogProbs
\x12
>
\n\x11\x61
ll_hidden_states
\x18\x0c
\x03
(
\x0b\x32
#.sglang.grpc.scheduler.HiddenStates
\"
L
\n\x0c\x46
inishReason
\x12\x08\n\x04
STOP
\x10\x00\x12\n\n\x06
LENGTH
\x10\x01\x12\r\n\t
EOS_TOKEN
\x10\x02\x12\x0c\n\x08
STOP_STR
\x10\x03\x12\t\n\x05\x41\x42
ORT
\x10\x04\"
K
\n\r
GenerateError
\x12\x0f\n\x07
message
\x18\x01
\x01
(
\t\x12\x18\n\x10
http_status_code
\x18\x02
\x01
(
\t\x12\x0f\n\x07\x64\x65
tails
\x18\x03
\x01
(
\t\"\x84\x01\n\x08
LogProbs
\x12\x16\n\x0e
token_logprobs
\x18\x01
\x03
(
\x02\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\x12\x38\n\x0c
top_logprobs
\x18\x03
\x03
(
\x0b\x32\"
.sglang.grpc.scheduler.TopLogProbs
\x12\x13\n\x0b
token_texts
\x18\x04
\x03
(
\t\"
E
\n\x0b
TopLogProbs
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\x12\x13\n\x0b
token_texts
\x18\x03
\x03
(
\t\"
?
\n\x0c
HiddenStates
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\r\n\x05
layer
\x18\x02
\x01
(
\x05\x12\x10\n\x08
position
\x18\x03
\x01
(
\x05\"\xca\x02\n\x0c\x45
mbedRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\t
tokenized
\x18\x02
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\x12
:
\n\t
mm_inputs
\x18\x04
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.MultimodalInputs
\x12
>
\n\x0f
sampling_params
\x18\x05
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.SamplingParams
\x12\x13\n\x0b
log_metrics
\x18\x06
\x01
(
\x08\x12\x16\n\x0e
token_type_ids
\x18\x07
\x03
(
\x05\x12\x1a\n\x12\x64\x61
ta_parallel_rank
\x18\x08
\x01
(
\x05\x12\x18\n\x10
is_cross_encoder
\x18\t
\x01
(
\x08\x12\r\n\x05
texts
\x18\n
\x03
(
\t\"\x9d\x01\n\r
EmbedResponse
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\x08\x63
omplete
\x18\x02
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.EmbedCompleteH
\x00\x12\x32\n\x05\x65
rror
\x18\x03
\x01
(
\x0b\x32
!.sglang.grpc.scheduler.EmbedErrorH
\x00\x42\n\n\x08
response
\"\xbc\x01\n\r
EmbedComplete
\x12\x11\n\t
embedding
\x18\x01
\x03
(
\x02\x12\x15\n\r
prompt_tokens
\x18\x02
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x03
\x01
(
\x05\x12\x15\n\r
embedding_dim
\x18\x04
\x01
(
\x05\x12\x17\n\x0f
generation_time
\x18\x05
\x01
(
\x02\x12
:
\n\x10\x62\x61
tch_embeddings
\x18\x06
\x03
(
\x0b\x32
.sglang.grpc.scheduler.Embedding
\"
*
\n\t
Embedding
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\r\n\x05
index
\x18\x02
\x01
(
\x05\"
<
\n\n
EmbedError
\x12\x0f\n\x07
message
\x18\x01
\x01
(
\t\x12\x0c\n\x04\x63
ode
\x18\x02
\x01
(
\t\x12\x0f\n\x07\x64\x65
tails
\x18\x03
\x01
(
\t\"
N
\n\x12
HealthCheckRequest
\x12\x38\n\t
tokenized
\x18\x01
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\"
7
\n\x13
HealthCheckResponse
\x12\x0f\n\x07
healthy
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
2
\n\x0c\x41\x62
ortRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x0e\n\x06
reason
\x18\x02
\x01
(
\t\"
1
\n\r
AbortResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
I
\n\x0f
LoadLoRARequest
\x12\x12\n\n
adapter_id
\x18\x01
\x01
(
\t\x12\x14\n\x0c\x61\x64\x61
pter_path
\x18\x02
\x01
(
\t\x12\x0c\n\x04
rank
\x18\x03
\x01
(
\x05\"
H
\n\x10
LoadLoRAResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x12\n\n
adapter_id
\x18\x02
\x01
(
\t\x12\x0f\n\x07
message
\x18\x03
\x01
(
\t\"\'\n\x11
UnloadLoRARequest
\x12\x12\n\n
adapter_id
\x18\x01
\x01
(
\t\"
6
\n\x12
UnloadLoRAResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
w
\n\x14
UpdateWeightsRequest
\x12\x13\n\t
disk_path
\x18\x01
\x01
(
\t
H
\x00\x12\x15\n\x0b
tensor_data
\x18\x02
\x01
(
\x0c
H
\x00\x12\x14\n\n
remote_url
\x18\x03
\x01
(
\t
H
\x00\x12\x13\n\x0b
weight_name
\x18\x04
\x01
(
\t
B
\x08\n\x06
source
\"
9
\n\x15
UpdateWeightsResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
-
\n\x17
GetInternalStateRequest
\x12\x12\n\n
state_keys
\x18\x01
\x03
(
\t\"
B
\n\x18
GetInternalStateResponse
\x12
&
\n\x05
state
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\"
A
\n\x17
SetInternalStateRequest
\x12
&
\n\x05
state
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\"
<
\n\x18
SetInternalStateResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t
2
\xfe\x02\n\x0f
SglangScheduler
\x12
]
\n\x08
Generate
\x12
&.sglang.grpc.scheduler.GenerateRequest
\x1a\'
.sglang.grpc.scheduler.GenerateResponse0
\x01\x12
R
\n\x05\x45
mbed
\x12
#.sglang.grpc.scheduler.EmbedRequest
\x1a
$.sglang.grpc.scheduler.EmbedResponse
\x12\x64\n\x0b
HealthCheck
\x12
).sglang.grpc.scheduler.HealthCheckRequest
\x1a
*.sglang.grpc.scheduler.HealthCheckResponse
\x12
R
\n\x05\x41\x62
ort
\x12
#.sglang.grpc.scheduler.AbortRequest
\x1a
$.sglang.grpc.scheduler.AbortResponseb
\x06
proto3'
)
DESCRIPTOR
=
_descriptor_pool
.
Default
().
AddSerializedFile
(
b
'
\n\x16
sglang_scheduler.proto
\x12\x15
sglang.grpc.scheduler
\x1a\x1f
google/protobuf/timestamp.proto
\x1a\x1c
google/protobuf/struct.proto
\"\xc
9
\x05\n\x0e
SamplingParams
\x12\x13\n\x0b
temperature
\x18\x01
\x01
(
\x02\x12\r\n\x05
top_p
\x18\x02
\x01
(
\x02\x12\r\n\x05
top_k
\x18\x03
\x01
(
\x05\x12\r\n\x05
min_p
\x18\x04
\x01
(
\x02\x12\x19\n\x11\x66
requency_penalty
\x18\x05
\x01
(
\x02\x12\x18\n\x10
presence_penalty
\x18\x06
\x01
(
\x02\x12\x1a\n\x12
repetition_penalty
\x18\x07
\x01
(
\x02\x12\x16\n\x0e
max_new_tokens
\x18\x08
\x01
(
\x05\x12\x0c\n\x04
stop
\x18\t
\x03
(
\t\x12\x16\n\x0e
stop_token_ids
\x18\n
\x03
(
\x05\x12\x1b\n\x13
skip_special_tokens
\x18\x0b
\x01
(
\x08\x12
%
\n\x1d
spaces_between_special_tokens
\x18\x0c
\x01
(
\x08\x12\x0f\n\x05
regex
\x18\r
\x01
(
\t
H
\x00\x12\x15\n\x0b
json_schema
\x18\x0e
\x01
(
\t
H
\x00\x12\x16\n\x0c\x65\x62
nf_grammar
\x18\x0f
\x01
(
\t
H
\x00\x12\x18\n\x0e
structural_tag
\x18\x10
\x01
(
\t
H
\x00\x12\x11\n\t
lora_path
\x18\x1
1
\x01
(
\t\x12\t\n\x01
n
\x18\x1
2
\x01
(
\x05\x12\x15\n\r
token_healing
\x18\x1
3
\x01
(
\x08\x12\x16\n\x0e
min_new_tokens
\x18\x1
4
\x01
(
\x05\x12\x12\n\n
ignore_eos
\x18\x1
5
\x01
(
\x08\x12\x14\n\x0c
no_stop_trim
\x18\x1
6
\x01
(
\x08\x12\x17\n\x0f
stream_interval
\x18\x1
7
\x01
(
\x05\x12
H
\n\n
logit_bias
\x18\x1
8
\x03
(
\x0b\x32\x34
.sglang.grpc.scheduler.SamplingParams.LogitBiasEntry
\x12
.
\n\r
custom_params
\x18\x19
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x1a\x30\n\x0e
LogitBiasEntry
\x12\x0b\n\x03
key
\x18\x01
\x01
(
\t\x12\r\n\x05
value
\x18\x02
\x01
(
\x02
:
\x02\x38\x01\x42\x0c\n\n
constraint
\"
]
\n\x13\x44
isaggregatedParams
\x12\x16\n\x0e\x62
ootstrap_host
\x18\x01
\x01
(
\t\x12\x16\n\x0e\x62
ootstrap_port
\x18\x02
\x01
(
\x05\x12\x16\n\x0e\x62
ootstrap_room
\x18\x03
\x01
(
\x05\"\xe9\x04\n\x0f
GenerateRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\t
tokenized
\x18\x02
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\x12
:
\n\t
mm_inputs
\x18\x03
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.MultimodalInputs
\x12
>
\n\x0f
sampling_params
\x18\x04
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.SamplingParams
\x12\x16\n\x0e
return_logprob
\x18\x05
\x01
(
\x08\x12\x19\n\x11
logprob_start_len
\x18\x06
\x01
(
\x05\x12\x18\n\x10
top_logprobs_num
\x18\x07
\x01
(
\x05\x12\x19\n\x11
token_ids_logprob
\x18\x08
\x03
(
\x05\x12\x1c\n\x14
return_hidden_states
\x18\t
\x01
(
\x08\x12
H
\n\x14\x64
isaggregated_params
\x18\n
\x01
(
\x0b\x32
*.sglang.grpc.scheduler.DisaggregatedParams
\x12\x1e\n\x16\x63
ustom_logit_processor
\x18\x0b
\x01
(
\t\x12
-
\n\t
timestamp
\x18\x0c
\x01
(
\x0b\x32\x1a
.google.protobuf.Timestamp
\x12\x13\n\x0b
log_metrics
\x18\r
\x01
(
\x08\x12\x14\n\x0c
input_embeds
\x18\x0e
\x03
(
\x02\x12\x0f\n\x07
lora_id
\x18\x0f
\x01
(
\t\x12\x1a\n\x12\x64\x61
ta_parallel_rank
\x18\x10
\x01
(
\x05\x12\x15\n\r
dp_balance_id
\x18\x11
\x01
(
\x05\"
:
\n\x0e
TokenizedInput
\x12\x15\n\r
original_text
\x18\x01
\x01
(
\t\x12\x11\n\t
input_ids
\x18\x02
\x03
(
\x05\"\xd3\x01\n\x10
MultimodalInputs
\x12\x12\n\n
image_urls
\x18\x01
\x03
(
\t\x12\x12\n\n
video_urls
\x18\x02
\x03
(
\t\x12\x12\n\n
audio_urls
\x18\x03
\x03
(
\t\x12\x33\n\x12
processed_features
\x18\x04
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x12\x12\n\n
image_data
\x18\x05
\x03
(
\x0c\x12\x12\n\n
video_data
\x18\x06
\x03
(
\x0c\x12\x12\n\n
audio_data
\x18\x07
\x03
(
\x0c\x12\x12\n\n
modalities
\x18\x08
\x03
(
\t\"\xe3\x01\n\x10
GenerateResponse
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12
;
\n\x05\x63
hunk
\x18\x02
\x01
(
\x0b\x32
*.sglang.grpc.scheduler.GenerateStreamChunkH
\x00\x12
;
\n\x08\x63
omplete
\x18\x03
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.GenerateCompleteH
\x00\x12\x35\n\x05\x65
rror
\x18\x04
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.GenerateErrorH
\x00\x42\n\n\x08
response
\"\xf5\x01\n\x13
GenerateStreamChunk
\x12\x10\n\x08
token_id
\x18\x01
\x01
(
\x05\x12\x0c\n\x04
text
\x18\x02
\x01
(
\t\x12\x15\n\r
prompt_tokens
\x18\x03
\x01
(
\x05\x12\x19\n\x11\x63
ompletion_tokens
\x18\x04
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x05
\x01
(
\x05\x12\x31\n\x08
logprobs
\x18\x06
\x01
(
\x0b\x32\x1f
.sglang.grpc.scheduler.LogProbs
\x12\x15\n\r
hidden_states
\x18\x07
\x03
(
\x02\x12\x17\n\x0f
generation_time
\x18\x08
\x01
(
\x02\x12\x12\n\n
queue_time
\x18\t
\x01
(
\x05\"\xcd\x02\n\x10
GenerateComplete
\x12\x12\n\n
output_ids
\x18\x01
\x03
(
\x05\x12\x13\n\x0b
output_text
\x18\x02
\x01
(
\t\x12
K
\n\r
finish_reason
\x18\x03
\x01
(
\x0e\x32\x34
.sglang.grpc.scheduler.GenerateComplete.FinishReason
\x12\x35\n\x0c\x61
ll_logprobs
\x18\x0b
\x03
(
\x0b\x32\x1f
.sglang.grpc.scheduler.LogProbs
\x12
>
\n\x11\x61
ll_hidden_states
\x18\x0c
\x03
(
\x0b\x32
#.sglang.grpc.scheduler.HiddenStates
\"
L
\n\x0c\x46
inishReason
\x12\x08\n\x04
STOP
\x10\x00\x12\n\n\x06
LENGTH
\x10\x01\x12\r\n\t
EOS_TOKEN
\x10\x02\x12\x0c\n\x08
STOP_STR
\x10\x03\x12\t\n\x05\x41\x42
ORT
\x10\x04\"
K
\n\r
GenerateError
\x12\x0f\n\x07
message
\x18\x01
\x01
(
\t\x12\x18\n\x10
http_status_code
\x18\x02
\x01
(
\t\x12\x0f\n\x07\x64\x65
tails
\x18\x03
\x01
(
\t\"\x84\x01\n\x08
LogProbs
\x12\x16\n\x0e
token_logprobs
\x18\x01
\x03
(
\x02\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\x12\x38\n\x0c
top_logprobs
\x18\x03
\x03
(
\x0b\x32\"
.sglang.grpc.scheduler.TopLogProbs
\x12\x13\n\x0b
token_texts
\x18\x04
\x03
(
\t\"
E
\n\x0b
TopLogProbs
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\x12\x13\n\x0b
token_texts
\x18\x03
\x03
(
\t\"
?
\n\x0c
HiddenStates
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\r\n\x05
layer
\x18\x02
\x01
(
\x05\x12\x10\n\x08
position
\x18\x03
\x01
(
\x05\"\xca\x02\n\x0c\x45
mbedRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\t
tokenized
\x18\x02
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\x12
:
\n\t
mm_inputs
\x18\x04
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.MultimodalInputs
\x12
>
\n\x0f
sampling_params
\x18\x05
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.SamplingParams
\x12\x13\n\x0b
log_metrics
\x18\x06
\x01
(
\x08\x12\x16\n\x0e
token_type_ids
\x18\x07
\x03
(
\x05\x12\x1a\n\x12\x64\x61
ta_parallel_rank
\x18\x08
\x01
(
\x05\x12\x18\n\x10
is_cross_encoder
\x18\t
\x01
(
\x08\x12\r\n\x05
texts
\x18\n
\x03
(
\t\"\x9d\x01\n\r
EmbedResponse
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\x08\x63
omplete
\x18\x02
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.EmbedCompleteH
\x00\x12\x32\n\x05\x65
rror
\x18\x03
\x01
(
\x0b\x32
!.sglang.grpc.scheduler.EmbedErrorH
\x00\x42\n\n\x08
response
\"\xbc\x01\n\r
EmbedComplete
\x12\x11\n\t
embedding
\x18\x01
\x03
(
\x02\x12\x15\n\r
prompt_tokens
\x18\x02
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x03
\x01
(
\x05\x12\x15\n\r
embedding_dim
\x18\x04
\x01
(
\x05\x12\x17\n\x0f
generation_time
\x18\x05
\x01
(
\x02\x12
:
\n\x10\x62\x61
tch_embeddings
\x18\x06
\x03
(
\x0b\x32
.sglang.grpc.scheduler.Embedding
\"
*
\n\t
Embedding
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\r\n\x05
index
\x18\x02
\x01
(
\x05\"
<
\n\n
EmbedError
\x12\x0f\n\x07
message
\x18\x01
\x01
(
\t\x12\x0c\n\x04\x63
ode
\x18\x02
\x01
(
\t\x12\x0f\n\x07\x64\x65
tails
\x18\x03
\x01
(
\t\"
N
\n\x12
HealthCheckRequest
\x12\x38\n\t
tokenized
\x18\x01
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\"
7
\n\x13
HealthCheckResponse
\x12\x0f\n\x07
healthy
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
2
\n\x0c\x41\x62
ortRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x0e\n\x06
reason
\x18\x02
\x01
(
\t\"
1
\n\r
AbortResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
I
\n\x0f
LoadLoRARequest
\x12\x12\n\n
adapter_id
\x18\x01
\x01
(
\t\x12\x14\n\x0c\x61\x64\x61
pter_path
\x18\x02
\x01
(
\t\x12\x0c\n\x04
rank
\x18\x03
\x01
(
\x05\"
H
\n\x10
LoadLoRAResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x12\n\n
adapter_id
\x18\x02
\x01
(
\t\x12\x0f\n\x07
message
\x18\x03
\x01
(
\t\"\'\n\x11
UnloadLoRARequest
\x12\x12\n\n
adapter_id
\x18\x01
\x01
(
\t\"
6
\n\x12
UnloadLoRAResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
w
\n\x14
UpdateWeightsRequest
\x12\x13\n\t
disk_path
\x18\x01
\x01
(
\t
H
\x00\x12\x15\n\x0b
tensor_data
\x18\x02
\x01
(
\x0c
H
\x00\x12\x14\n\n
remote_url
\x18\x03
\x01
(
\t
H
\x00\x12\x13\n\x0b
weight_name
\x18\x04
\x01
(
\t
B
\x08\n\x06
source
\"
9
\n\x15
UpdateWeightsResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
-
\n\x17
GetInternalStateRequest
\x12\x12\n\n
state_keys
\x18\x01
\x03
(
\t\"
B
\n\x18
GetInternalStateResponse
\x12
&
\n\x05
state
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\"
A
\n\x17
SetInternalStateRequest
\x12
&
\n\x05
state
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\"
<
\n\x18
SetInternalStateResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t
2
\xfe\x02\n\x0f
SglangScheduler
\x12
]
\n\x08
Generate
\x12
&.sglang.grpc.scheduler.GenerateRequest
\x1a\'
.sglang.grpc.scheduler.GenerateResponse0
\x01\x12
R
\n\x05\x45
mbed
\x12
#.sglang.grpc.scheduler.EmbedRequest
\x1a
$.sglang.grpc.scheduler.EmbedResponse
\x12\x64\n\x0b
HealthCheck
\x12
).sglang.grpc.scheduler.HealthCheckRequest
\x1a
*.sglang.grpc.scheduler.HealthCheckResponse
\x12
R
\n\x05\x41\x62
ort
\x12
#.sglang.grpc.scheduler.AbortRequest
\x1a
$.sglang.grpc.scheduler.AbortResponseb
\x06
proto3'
)
_globals
=
globals
()
_globals
=
globals
()
_builder
.
BuildMessageAndEnumDescriptors
(
DESCRIPTOR
,
_globals
)
_builder
.
BuildMessageAndEnumDescriptors
(
DESCRIPTOR
,
_globals
)
...
@@ -36,71 +39,71 @@ if not _descriptor._USE_C_DESCRIPTORS:
...
@@ -36,71 +39,71 @@ if not _descriptor._USE_C_DESCRIPTORS:
_globals
[
'_SAMPLINGPARAMS_LOGITBIASENTRY'
].
_loaded_options
=
None
_globals
[
'_SAMPLINGPARAMS_LOGITBIASENTRY'
].
_loaded_options
=
None
_globals
[
'_SAMPLINGPARAMS_LOGITBIASENTRY'
].
_serialized_options
=
b
'8
\001
'
_globals
[
'_SAMPLINGPARAMS_LOGITBIASENTRY'
].
_serialized_options
=
b
'8
\001
'
_globals
[
'_SAMPLINGPARAMS'
].
_serialized_start
=
113
_globals
[
'_SAMPLINGPARAMS'
].
_serialized_start
=
113
_globals
[
'_SAMPLINGPARAMS'
].
_serialized_end
=
82
4
_globals
[
'_SAMPLINGPARAMS'
].
_serialized_end
=
82
6
_globals
[
'_SAMPLINGPARAMS_LOGITBIASENTRY'
].
_serialized_start
=
76
2
_globals
[
'_SAMPLINGPARAMS_LOGITBIASENTRY'
].
_serialized_start
=
76
4
_globals
[
'_SAMPLINGPARAMS_LOGITBIASENTRY'
].
_serialized_end
=
81
0
_globals
[
'_SAMPLINGPARAMS_LOGITBIASENTRY'
].
_serialized_end
=
81
2
_globals
[
'_DISAGGREGATEDPARAMS'
].
_serialized_start
=
82
6
_globals
[
'_DISAGGREGATEDPARAMS'
].
_serialized_start
=
82
8
_globals
[
'_DISAGGREGATEDPARAMS'
].
_serialized_end
=
91
9
_globals
[
'_DISAGGREGATEDPARAMS'
].
_serialized_end
=
9
2
1
_globals
[
'_GENERATEREQUEST'
].
_serialized_start
=
92
2
_globals
[
'_GENERATEREQUEST'
].
_serialized_start
=
92
4
_globals
[
'_GENERATEREQUEST'
].
_serialized_end
=
15
39
_globals
[
'_GENERATEREQUEST'
].
_serialized_end
=
15
41
_globals
[
'_TOKENIZEDINPUT'
].
_serialized_start
=
154
1
_globals
[
'_TOKENIZEDINPUT'
].
_serialized_start
=
154
3
_globals
[
'_TOKENIZEDINPUT'
].
_serialized_end
=
1
599
_globals
[
'_TOKENIZEDINPUT'
].
_serialized_end
=
1
601
_globals
[
'_MULTIMODALINPUTS'
].
_serialized_start
=
160
2
_globals
[
'_MULTIMODALINPUTS'
].
_serialized_start
=
160
4
_globals
[
'_MULTIMODALINPUTS'
].
_serialized_end
=
181
3
_globals
[
'_MULTIMODALINPUTS'
].
_serialized_end
=
181
5
_globals
[
'_GENERATERESPONSE'
].
_serialized_start
=
181
6
_globals
[
'_GENERATERESPONSE'
].
_serialized_start
=
181
8
_globals
[
'_GENERATERESPONSE'
].
_serialized_end
=
204
3
_globals
[
'_GENERATERESPONSE'
].
_serialized_end
=
204
5
_globals
[
'_GENERATESTREAMCHUNK'
].
_serialized_start
=
204
6
_globals
[
'_GENERATESTREAMCHUNK'
].
_serialized_start
=
204
8
_globals
[
'_GENERATESTREAMCHUNK'
].
_serialized_end
=
229
1
_globals
[
'_GENERATESTREAMCHUNK'
].
_serialized_end
=
229
3
_globals
[
'_GENERATECOMPLETE'
].
_serialized_start
=
229
4
_globals
[
'_GENERATECOMPLETE'
].
_serialized_start
=
229
6
_globals
[
'_GENERATECOMPLETE'
].
_serialized_end
=
262
7
_globals
[
'_GENERATECOMPLETE'
].
_serialized_end
=
262
9
_globals
[
'_GENERATECOMPLETE_FINISHREASON'
].
_serialized_start
=
255
1
_globals
[
'_GENERATECOMPLETE_FINISHREASON'
].
_serialized_start
=
255
3
_globals
[
'_GENERATECOMPLETE_FINISHREASON'
].
_serialized_end
=
262
7
_globals
[
'_GENERATECOMPLETE_FINISHREASON'
].
_serialized_end
=
262
9
_globals
[
'_GENERATEERROR'
].
_serialized_start
=
26
29
_globals
[
'_GENERATEERROR'
].
_serialized_start
=
26
31
_globals
[
'_GENERATEERROR'
].
_serialized_end
=
270
4
_globals
[
'_GENERATEERROR'
].
_serialized_end
=
270
6
_globals
[
'_LOGPROBS'
].
_serialized_start
=
270
7
_globals
[
'_LOGPROBS'
].
_serialized_start
=
270
9
_globals
[
'_LOGPROBS'
].
_serialized_end
=
28
39
_globals
[
'_LOGPROBS'
].
_serialized_end
=
28
41
_globals
[
'_TOPLOGPROBS'
].
_serialized_start
=
284
1
_globals
[
'_TOPLOGPROBS'
].
_serialized_start
=
284
3
_globals
[
'_TOPLOGPROBS'
].
_serialized_end
=
291
0
_globals
[
'_TOPLOGPROBS'
].
_serialized_end
=
291
2
_globals
[
'_HIDDENSTATES'
].
_serialized_start
=
291
2
_globals
[
'_HIDDENSTATES'
].
_serialized_start
=
291
4
_globals
[
'_HIDDENSTATES'
].
_serialized_end
=
297
5
_globals
[
'_HIDDENSTATES'
].
_serialized_end
=
297
7
_globals
[
'_EMBEDREQUEST'
].
_serialized_start
=
29
7
8
_globals
[
'_EMBEDREQUEST'
].
_serialized_start
=
298
0
_globals
[
'_EMBEDREQUEST'
].
_serialized_end
=
330
8
_globals
[
'_EMBEDREQUEST'
].
_serialized_end
=
33
1
0
_globals
[
'_EMBEDRESPONSE'
].
_serialized_start
=
331
1
_globals
[
'_EMBEDRESPONSE'
].
_serialized_start
=
331
3
_globals
[
'_EMBEDRESPONSE'
].
_serialized_end
=
34
68
_globals
[
'_EMBEDRESPONSE'
].
_serialized_end
=
34
70
_globals
[
'_EMBEDCOMPLETE'
].
_serialized_start
=
347
1
_globals
[
'_EMBEDCOMPLETE'
].
_serialized_start
=
347
3
_globals
[
'_EMBEDCOMPLETE'
].
_serialized_end
=
36
59
_globals
[
'_EMBEDCOMPLETE'
].
_serialized_end
=
36
61
_globals
[
'_EMBEDDING'
].
_serialized_start
=
366
1
_globals
[
'_EMBEDDING'
].
_serialized_start
=
366
3
_globals
[
'_EMBEDDING'
].
_serialized_end
=
370
3
_globals
[
'_EMBEDDING'
].
_serialized_end
=
370
5
_globals
[
'_EMBEDERROR'
].
_serialized_start
=
370
5
_globals
[
'_EMBEDERROR'
].
_serialized_start
=
370
7
_globals
[
'_EMBEDERROR'
].
_serialized_end
=
376
5
_globals
[
'_EMBEDERROR'
].
_serialized_end
=
376
7
_globals
[
'_HEALTHCHECKREQUEST'
].
_serialized_start
=
376
7
_globals
[
'_HEALTHCHECKREQUEST'
].
_serialized_start
=
376
9
_globals
[
'_HEALTHCHECKREQUEST'
].
_serialized_end
=
384
5
_globals
[
'_HEALTHCHECKREQUEST'
].
_serialized_end
=
384
7
_globals
[
'_HEALTHCHECKRESPONSE'
].
_serialized_start
=
384
7
_globals
[
'_HEALTHCHECKRESPONSE'
].
_serialized_start
=
384
9
_globals
[
'_HEALTHCHECKRESPONSE'
].
_serialized_end
=
390
2
_globals
[
'_HEALTHCHECKRESPONSE'
].
_serialized_end
=
390
4
_globals
[
'_ABORTREQUEST'
].
_serialized_start
=
390
4
_globals
[
'_ABORTREQUEST'
].
_serialized_start
=
390
6
_globals
[
'_ABORTREQUEST'
].
_serialized_end
=
395
4
_globals
[
'_ABORTREQUEST'
].
_serialized_end
=
395
6
_globals
[
'_ABORTRESPONSE'
].
_serialized_start
=
395
6
_globals
[
'_ABORTRESPONSE'
].
_serialized_start
=
395
8
_globals
[
'_ABORTRESPONSE'
].
_serialized_end
=
400
5
_globals
[
'_ABORTRESPONSE'
].
_serialized_end
=
400
7
_globals
[
'_LOADLORAREQUEST'
].
_serialized_start
=
400
7
_globals
[
'_LOADLORAREQUEST'
].
_serialized_start
=
400
9
_globals
[
'_LOADLORAREQUEST'
].
_serialized_end
=
408
0
_globals
[
'_LOADLORAREQUEST'
].
_serialized_end
=
408
2
_globals
[
'_LOADLORARESPONSE'
].
_serialized_start
=
408
2
_globals
[
'_LOADLORARESPONSE'
].
_serialized_start
=
408
4
_globals
[
'_LOADLORARESPONSE'
].
_serialized_end
=
415
4
_globals
[
'_LOADLORARESPONSE'
].
_serialized_end
=
415
6
_globals
[
'_UNLOADLORAREQUEST'
].
_serialized_start
=
415
6
_globals
[
'_UNLOADLORAREQUEST'
].
_serialized_start
=
415
8
_globals
[
'_UNLOADLORAREQUEST'
].
_serialized_end
=
419
5
_globals
[
'_UNLOADLORAREQUEST'
].
_serialized_end
=
419
7
_globals
[
'_UNLOADLORARESPONSE'
].
_serialized_start
=
419
7
_globals
[
'_UNLOADLORARESPONSE'
].
_serialized_start
=
419
9
_globals
[
'_UNLOADLORARESPONSE'
].
_serialized_end
=
425
1
_globals
[
'_UNLOADLORARESPONSE'
].
_serialized_end
=
425
3
_globals
[
'_UPDATEWEIGHTSREQUEST'
].
_serialized_start
=
425
3
_globals
[
'_UPDATEWEIGHTSREQUEST'
].
_serialized_start
=
425
5
_globals
[
'_UPDATEWEIGHTSREQUEST'
].
_serialized_end
=
437
2
_globals
[
'_UPDATEWEIGHTSREQUEST'
].
_serialized_end
=
437
4
_globals
[
'_UPDATEWEIGHTSRESPONSE'
].
_serialized_start
=
437
4
_globals
[
'_UPDATEWEIGHTSRESPONSE'
].
_serialized_start
=
437
6
_globals
[
'_UPDATEWEIGHTSRESPONSE'
].
_serialized_end
=
443
1
_globals
[
'_UPDATEWEIGHTSRESPONSE'
].
_serialized_end
=
443
3
_globals
[
'_GETINTERNALSTATEREQUEST'
].
_serialized_start
=
443
3
_globals
[
'_GETINTERNALSTATEREQUEST'
].
_serialized_start
=
443
5
_globals
[
'_GETINTERNALSTATEREQUEST'
].
_serialized_end
=
44
7
8
_globals
[
'_GETINTERNALSTATEREQUEST'
].
_serialized_end
=
448
0
_globals
[
'_GETINTERNALSTATERESPONSE'
].
_serialized_start
=
448
0
_globals
[
'_GETINTERNALSTATERESPONSE'
].
_serialized_start
=
448
2
_globals
[
'_GETINTERNALSTATERESPONSE'
].
_serialized_end
=
454
6
_globals
[
'_GETINTERNALSTATERESPONSE'
].
_serialized_end
=
454
8
_globals
[
'_SETINTERNALSTATEREQUEST'
].
_serialized_start
=
45
48
_globals
[
'_SETINTERNALSTATEREQUEST'
].
_serialized_start
=
45
50
_globals
[
'_SETINTERNALSTATEREQUEST'
].
_serialized_end
=
461
3
_globals
[
'_SETINTERNALSTATEREQUEST'
].
_serialized_end
=
461
5
_globals
[
'_SETINTERNALSTATERESPONSE'
].
_serialized_start
=
461
5
_globals
[
'_SETINTERNALSTATERESPONSE'
].
_serialized_start
=
461
7
_globals
[
'_SETINTERNALSTATERESPONSE'
].
_serialized_end
=
467
5
_globals
[
'_SETINTERNALSTATERESPONSE'
].
_serialized_end
=
467
7
_globals
[
'_SGLANGSCHEDULER'
].
_serialized_start
=
46
7
8
_globals
[
'_SGLANGSCHEDULER'
].
_serialized_start
=
468
0
_globals
[
'_SGLANGSCHEDULER'
].
_serialized_end
=
506
0
_globals
[
'_SGLANGSCHEDULER'
].
_serialized_end
=
506
2
# @@protoc_insertion_point(module_scope)
# @@protoc_insertion_point(module_scope)
python/sglang/srt/grpc/sglang_scheduler_pb2.pyi
View file @
91678474
...
@@ -12,7 +12,7 @@ from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union
...
@@ -12,7 +12,7 @@ from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union
DESCRIPTOR: _descriptor.FileDescriptor
DESCRIPTOR: _descriptor.FileDescriptor
class SamplingParams(_message.Message):
class SamplingParams(_message.Message):
__slots__ = ("temperature", "top_p", "top_k", "min_p", "frequency_penalty", "presence_penalty", "repetition_penalty", "max_new_tokens", "stop", "stop_token_ids", "skip_special_tokens", "spaces_between_special_tokens", "regex", "json_schema", "ebnf_grammar", "lora_path", "n", "token_healing", "min_new_tokens", "ignore_eos", "no_stop_trim", "stream_interval", "logit_bias",
"structural_tag",
"custom_params")
__slots__ = ("temperature", "top_p", "top_k", "min_p", "frequency_penalty", "presence_penalty", "repetition_penalty", "max_new_tokens", "stop", "stop_token_ids", "skip_special_tokens", "spaces_between_special_tokens", "regex", "json_schema", "ebnf_grammar",
"structural_tag",
"lora_path", "n", "token_healing", "min_new_tokens", "ignore_eos", "no_stop_trim", "stream_interval", "logit_bias", "custom_params")
class LogitBiasEntry(_message.Message):
class LogitBiasEntry(_message.Message):
__slots__ = ("key", "value")
__slots__ = ("key", "value")
KEY_FIELD_NUMBER: _ClassVar[int]
KEY_FIELD_NUMBER: _ClassVar[int]
...
@@ -35,6 +35,7 @@ class SamplingParams(_message.Message):
...
@@ -35,6 +35,7 @@ class SamplingParams(_message.Message):
REGEX_FIELD_NUMBER: _ClassVar[int]
REGEX_FIELD_NUMBER: _ClassVar[int]
JSON_SCHEMA_FIELD_NUMBER: _ClassVar[int]
JSON_SCHEMA_FIELD_NUMBER: _ClassVar[int]
EBNF_GRAMMAR_FIELD_NUMBER: _ClassVar[int]
EBNF_GRAMMAR_FIELD_NUMBER: _ClassVar[int]
STRUCTURAL_TAG_FIELD_NUMBER: _ClassVar[int]
LORA_PATH_FIELD_NUMBER: _ClassVar[int]
LORA_PATH_FIELD_NUMBER: _ClassVar[int]
N_FIELD_NUMBER: _ClassVar[int]
N_FIELD_NUMBER: _ClassVar[int]
TOKEN_HEALING_FIELD_NUMBER: _ClassVar[int]
TOKEN_HEALING_FIELD_NUMBER: _ClassVar[int]
...
@@ -43,7 +44,6 @@ class SamplingParams(_message.Message):
...
@@ -43,7 +44,6 @@ class SamplingParams(_message.Message):
NO_STOP_TRIM_FIELD_NUMBER: _ClassVar[int]
NO_STOP_TRIM_FIELD_NUMBER: _ClassVar[int]
STREAM_INTERVAL_FIELD_NUMBER: _ClassVar[int]
STREAM_INTERVAL_FIELD_NUMBER: _ClassVar[int]
LOGIT_BIAS_FIELD_NUMBER: _ClassVar[int]
LOGIT_BIAS_FIELD_NUMBER: _ClassVar[int]
STRUCTURAL_TAG_FIELD_NUMBER: _ClassVar[int]
CUSTOM_PARAMS_FIELD_NUMBER: _ClassVar[int]
CUSTOM_PARAMS_FIELD_NUMBER: _ClassVar[int]
temperature: float
temperature: float
top_p: float
top_p: float
...
@@ -60,6 +60,7 @@ class SamplingParams(_message.Message):
...
@@ -60,6 +60,7 @@ class SamplingParams(_message.Message):
regex: str
regex: str
json_schema: str
json_schema: str
ebnf_grammar: str
ebnf_grammar: str
structural_tag: str
lora_path: str
lora_path: str
n: int
n: int
token_healing: bool
token_healing: bool
...
@@ -68,9 +69,8 @@ class SamplingParams(_message.Message):
...
@@ -68,9 +69,8 @@ class SamplingParams(_message.Message):
no_stop_trim: bool
no_stop_trim: bool
stream_interval: int
stream_interval: int
logit_bias: _containers.ScalarMap[str, float]
logit_bias: _containers.ScalarMap[str, float]
structural_tag: str
custom_params: _struct_pb2.Struct
custom_params: _struct_pb2.Struct
def __init__(self, temperature: _Optional[float] = ..., top_p: _Optional[float] = ..., top_k: _Optional[int] = ..., min_p: _Optional[float] = ..., frequency_penalty: _Optional[float] = ..., presence_penalty: _Optional[float] = ..., repetition_penalty: _Optional[float] = ..., max_new_tokens: _Optional[int] = ..., stop: _Optional[_Iterable[str]] = ..., stop_token_ids: _Optional[_Iterable[int]] = ..., skip_special_tokens: bool = ..., spaces_between_special_tokens: bool = ..., regex: _Optional[str] = ..., json_schema: _Optional[str] = ..., ebnf_grammar: _Optional[str] = ..., lora_path: _Optional[str] = ..., n: _Optional[int] = ..., token_healing: bool = ..., min_new_tokens: _Optional[int] = ..., ignore_eos: bool = ..., no_stop_trim: bool = ..., stream_interval: _Optional[int] = ..., logit_bias: _Optional[_Mapping[str, float]] = ...,
structural_tag: _Optional[str] = ...,
custom_params: _Optional[_Union[_struct_pb2.Struct, _Mapping]] = ...) -> None: ...
def __init__(self, temperature: _Optional[float] = ..., top_p: _Optional[float] = ..., top_k: _Optional[int] = ..., min_p: _Optional[float] = ..., frequency_penalty: _Optional[float] = ..., presence_penalty: _Optional[float] = ..., repetition_penalty: _Optional[float] = ..., max_new_tokens: _Optional[int] = ..., stop: _Optional[_Iterable[str]] = ..., stop_token_ids: _Optional[_Iterable[int]] = ..., skip_special_tokens: bool = ..., spaces_between_special_tokens: bool = ..., regex: _Optional[str] = ..., json_schema: _Optional[str] = ..., ebnf_grammar: _Optional[str] = ...,
structural_tag: _Optional[str] = ...,
lora_path: _Optional[str] = ..., n: _Optional[int] = ..., token_healing: bool = ..., min_new_tokens: _Optional[int] = ..., ignore_eos: bool = ..., no_stop_trim: bool = ..., stream_interval: _Optional[int] = ..., logit_bias: _Optional[_Mapping[str, float]] = ..., custom_params: _Optional[_Union[_struct_pb2.Struct, _Mapping]] = ...) -> None: ...
class DisaggregatedParams(_message.Message):
class DisaggregatedParams(_message.Message):
__slots__ = ("bootstrap_host", "bootstrap_port", "bootstrap_room")
__slots__ = ("bootstrap_host", "bootstrap_port", "bootstrap_room")
...
...
python/sglang/srt/grpc/sglang_scheduler_pb2_grpc.py
View file @
91678474
# This file is auto-generated. Do not edit manually.
# Regenerate with: python compile_proto.py
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
"""Client and server classes corresponding to protobuf-defined services."""
import
grpc
import
grpc
...
...
sgl-router/src/proto/sglang_scheduler.proto
View file @
91678474
...
@@ -47,24 +47,24 @@ message SamplingParams {
...
@@ -47,24 +47,24 @@ message SamplingParams {
string
regex
=
13
;
string
regex
=
13
;
string
json_schema
=
14
;
string
json_schema
=
14
;
string
ebnf_grammar
=
15
;
string
ebnf_grammar
=
15
;
string
structural_tag
=
16
;
}
}
// LoRA adapter
// LoRA adapter
string
lora_path
=
1
6
;
string
lora_path
=
1
7
;
// Speculative decoding
// Speculative decoding
int32
n
=
1
7
;
// Number of samples
int32
n
=
1
8
;
// Number of samples
// Token healing
// Token healing
bool
token_healing
=
1
8
;
bool
token_healing
=
1
9
;
// Additional parameters
// Additional parameters
int32
min_new_tokens
=
19
;
int32
min_new_tokens
=
20
;
bool
ignore_eos
=
20
;
bool
ignore_eos
=
21
;
bool
no_stop_trim
=
21
;
bool
no_stop_trim
=
22
;
int32
stream_interval
=
22
;
int32
stream_interval
=
23
;
map
<
string
,
float
>
logit_bias
=
23
;
map
<
string
,
float
>
logit_bias
=
24
;
string
structural_tag
=
24
;
// Custom parameters for extensibility
// Custom parameters for extensibility
google.protobuf.Struct
custom_params
=
25
;
google.protobuf.Struct
custom_params
=
25
;
...
...
sgl-router/src/routers/grpc/router.rs
View file @
91678474
...
@@ -241,14 +241,14 @@ impl GrpcRouter {
...
@@ -241,14 +241,14 @@ impl GrpcRouter {
debug!
(
"Tokenized {} tokens from input"
,
token_ids
.len
());
debug!
(
"Tokenized {} tokens from input"
,
token_ids
.len
());
// Step 5: Build tool constraints if needed
// Step 5: Build tool constraints if needed
let
structural_tag
=
if
let
Some
(
tools
)
=
&
body
.tools
{
let
tool_call_constraint
=
if
let
Some
(
tools
)
=
&
body
.tools
{
self
.generate_tool_constraints
(
tools
,
&
body
.tool_choice
,
&
body
.model
)
self
.generate_tool_constraints
(
tools
,
&
body
.tool_choice
,
&
body
.model
)
}
else
{
}
else
{
None
None
};
};
// Step 6: Build SamplingParams for gRPC
// Step 6: Build SamplingParams for gRPC
let
sampling_params
=
match
self
.build_grpc_sampling_params
(
body
,
structural_tag
)
{
let
sampling_params
=
match
self
.build_grpc_sampling_params
(
body
,
tool_call_constraint
)
{
Ok
(
params
)
=>
params
,
Ok
(
params
)
=>
params
,
Err
(
e
)
=>
{
Err
(
e
)
=>
{
error!
(
"Failed to build sampling parameters: {}"
,
e
);
error!
(
"Failed to build sampling parameters: {}"
,
e
);
...
@@ -286,6 +286,41 @@ impl GrpcRouter {
...
@@ -286,6 +286,41 @@ impl GrpcRouter {
}
}
// ============ Helper Methods ============
// ============ Helper Methods ============
/// Select a worker for the request
fn
select_worker_for_request
(
&
self
,
model_id
:
Option
<&
str
>
,
text
:
Option
<&
str
>
,
)
->
Option
<
Arc
<
dyn
crate
::
core
::
Worker
>>
{
// Get workers for the specified model, filtered by connection mode
let
workers
=
self
.worker_registry
.get_workers_filtered
(
model_id
,
Some
(
WorkerType
::
Regular
),
Some
(
crate
::
core
::
ConnectionMode
::
Grpc
{
port
:
None
}),
false
,
// get all workers, we'll filter by is_available() next
);
// Filter by availability (health + circuit breaker)
let
available
:
Vec
<
Arc
<
dyn
crate
::
core
::
Worker
>>
=
workers
.iter
()
.filter
(|
w
|
w
.is_available
())
.cloned
()
.collect
();
if
available
.is_empty
()
{
return
None
;
}
// Get the appropriate policy for this model
let
policy
=
match
model_id
{
Some
(
model
)
=>
self
.policy_registry
.get_policy_or_default
(
model
),
None
=>
self
.policy_registry
.get_default_policy
(),
};
// Select worker using the policy
let
idx
=
policy
.select_worker
(
&
available
,
text
)
?
;
Some
(
available
[
idx
]
.clone
())
}
/// Process chat messages and apply template
/// Process chat messages and apply template
fn
process_chat_messages
(
fn
process_chat_messages
(
...
@@ -516,7 +551,7 @@ impl GrpcRouter {
...
@@ -516,7 +551,7 @@ impl GrpcRouter {
fn
build_grpc_sampling_params
(
fn
build_grpc_sampling_params
(
&
self
,
&
self
,
request
:
&
ChatCompletionRequest
,
request
:
&
ChatCompletionRequest
,
structural_tag
:
Option
<
String
>
,
tool_call_constraint
:
Option
<
(
String
,
String
)
>
,
)
->
Result
<
proto
::
SamplingParams
,
String
>
{
)
->
Result
<
proto
::
SamplingParams
,
String
>
{
let
stop_sequences
=
self
.extract_stop_strings
(
request
);
let
stop_sequences
=
self
.extract_stop_strings
(
request
);
...
@@ -555,8 +590,7 @@ impl GrpcRouter {
...
@@ -555,8 +590,7 @@ impl GrpcRouter {
stop_token_ids
:
request
.stop_token_ids
.clone
()
.unwrap_or_default
(),
stop_token_ids
:
request
.stop_token_ids
.clone
()
.unwrap_or_default
(),
skip_special_tokens
,
skip_special_tokens
,
n
:
request
.n
.unwrap_or
(
1
)
as
i32
,
n
:
request
.n
.unwrap_or
(
1
)
as
i32
,
structural_tag
:
structural_tag
.unwrap_or_default
(),
constraint
:
self
.build_constraint
(
request
,
tool_call_constraint
)
?
,
constraint
:
self
.build_constraint
(
request
)
?
,
..
Default
::
default
()
..
Default
::
default
()
})
})
}
}
...
@@ -574,28 +608,48 @@ impl GrpcRouter {
...
@@ -574,28 +608,48 @@ impl GrpcRouter {
fn
build_constraint
(
fn
build_constraint
(
&
self
,
&
self
,
request
:
&
ChatCompletionRequest
,
request
:
&
ChatCompletionRequest
,
tool_call_constraint
:
Option
<
(
String
,
String
)
>
,
)
->
Result
<
Option
<
proto
::
sampling_params
::
Constraint
>
,
String
>
{
)
->
Result
<
Option
<
proto
::
sampling_params
::
Constraint
>
,
String
>
{
let
mut
constraints
=
Vec
::
new
();
if
let
Some
(
ResponseFormat
::
JsonSchema
{
json_schema
})
=
&
request
.response_format
{
if
let
Some
(
ResponseFormat
::
JsonSchema
{
json_schema
})
=
&
request
.response_format
{
let
schema_str
=
serde_json
::
to_string
(
&
json_schema
.schema
)
let
schema_str
=
serde_json
::
to_string
(
&
json_schema
.schema
)
.map_err
(|
e
|
format!
(
"Failed to serialize JSON schema: {}"
,
e
))
?
;
.map_err
(|
e
|
format!
(
"Failed to serialize JSON schema: {}"
,
e
))
?
;
return
Ok
(
Some
(
proto
::
sampling_params
::
Constraint
::
JsonSchema
(
constraints
.push
(
proto
::
sampling_params
::
Constraint
::
JsonSchema
(
schema_str
));
schema_str
,
)));
}
}
if
let
Some
(
ebnf
)
=
&
request
.ebnf
{
if
let
Some
(
ebnf
)
=
&
request
.ebnf
{
return
Ok
(
Some
(
proto
::
sampling_params
::
Constraint
::
EbnfGrammar
(
constraints
.push
(
proto
::
sampling_params
::
Constraint
::
EbnfGrammar
(
ebnf
.clone
(),
ebnf
.clone
(),
))
)
;
));
}
}
if
let
Some
(
regex
)
=
&
request
.regex
{
if
let
Some
(
regex
)
=
&
request
.regex
{
return
Ok
(
Some
(
proto
::
sampling_params
::
Constraint
::
Regex
(
constraints
.push
(
proto
::
sampling_params
::
Constraint
::
Regex
(
regex
.clone
()));
regex
.clone
(),
}
)));
// Handle tool call constraint
if
let
Some
((
constraint_type
,
constraint_value
))
=
tool_call_constraint
{
if
!
constraints
.is_empty
()
{
return
Err
(
"Constrained decoding is not compatible with tool calls."
.to_string
());
}
let
tool_constraint
=
match
constraint_type
.as_str
()
{
"structural_tag"
=>
{
proto
::
sampling_params
::
Constraint
::
StructuralTag
(
constraint_value
)
}
"json_schema"
=>
proto
::
sampling_params
::
Constraint
::
JsonSchema
(
constraint_value
),
"ebnf"
=>
proto
::
sampling_params
::
Constraint
::
EbnfGrammar
(
constraint_value
),
"regex"
=>
proto
::
sampling_params
::
Constraint
::
Regex
(
constraint_value
),
_
=>
return
Err
(
format!
(
"Unknown constraint type: {}"
,
constraint_type
)),
};
constraints
.push
(
tool_constraint
);
}
}
Ok
(
None
)
match
constraints
.len
()
{
0
=>
Ok
(
None
),
1
=>
Ok
(
constraints
.pop
()),
_
=>
Err
(
"Multiple constraints are not allowed."
.to_string
()),
}
}
}
/// Generate tool constraints for structured generation
/// Generate tool constraints for structured generation
...
@@ -604,52 +658,19 @@ impl GrpcRouter {
...
@@ -604,52 +658,19 @@ impl GrpcRouter {
_
tools
:
&
[
crate
::
protocols
::
spec
::
Tool
],
_
tools
:
&
[
crate
::
protocols
::
spec
::
Tool
],
_
tool_choice
:
&
Option
<
crate
::
protocols
::
spec
::
ToolChoice
>
,
_
tool_choice
:
&
Option
<
crate
::
protocols
::
spec
::
ToolChoice
>
,
model
:
&
str
,
model
:
&
str
,
)
->
Option
<
String
>
{
)
->
Option
<
(
String
,
String
)
>
{
let
_
parser
=
self
.tool_parser_registry
.get_parser
(
model
)
?
;
let
_
parser
=
self
.tool_parser_registry
.get_parser
(
model
)
?
;
// TODO: Implement actual constraint generation logic
// For now, return None as this is placeholder implementation
None
None
}
}
/// Select a worker for the request
fn
select_worker_for_request
(
&
self
,
model_id
:
Option
<&
str
>
,
text
:
Option
<&
str
>
,
)
->
Option
<
Arc
<
dyn
crate
::
core
::
Worker
>>
{
// Get workers for the specified model, filtered by connection mode
let
workers
=
self
.worker_registry
.get_workers_filtered
(
model_id
,
Some
(
WorkerType
::
Regular
),
Some
(
crate
::
core
::
ConnectionMode
::
Grpc
{
port
:
None
}),
false
,
// get all workers, we'll filter by is_available() next
);
// Filter by availability (health + circuit breaker)
let
available
:
Vec
<
Arc
<
dyn
crate
::
core
::
Worker
>>
=
workers
.iter
()
.filter
(|
w
|
w
.is_available
())
.cloned
()
.collect
();
if
available
.is_empty
()
{
return
None
;
}
// Get the appropriate policy for this model
let
policy
=
match
model_id
{
Some
(
model
)
=>
self
.policy_registry
.get_policy_or_default
(
model
),
None
=>
self
.policy_registry
.get_default_policy
(),
};
// Select worker using the policy
let
idx
=
policy
.select_worker
(
&
available
,
text
)
?
;
Some
(
available
[
idx
]
.clone
())
}
/// Get or create a gRPC client for the worker
/// Get or create a gRPC client for the worker
async
fn
get_or_create_grpc_client
(
async
fn
get_or_create_grpc_client
(
&
self
,
&
self
,
worker_url
:
&
str
,
worker_url
:
&
str
,
)
->
Result
<
SglangSchedulerClient
,
String
>
{
)
->
Result
<
SglangSchedulerClient
,
String
>
{
// TODO: move to worker
debug!
(
"Creating new gRPC client for worker: {}"
,
worker_url
);
debug!
(
"Creating new gRPC client for worker: {}"
,
worker_url
);
SglangSchedulerClient
::
connect
(
worker_url
)
SglangSchedulerClient
::
connect
(
worker_url
)
.await
.await
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment