change / sglang · Commits

Commit 963175d5 (unverified) · authored Oct 02, 2025 by Chang Su, committed via GitHub on Oct 02, 2025

[router][grpc] Support streaming for v1/chat/completions (#11179)

parent 0618ad6d

Showing 20 changed files with 884 additions and 179 deletions (+884 −179)
python/sglang/srt/entrypoints/grpc_request_manager.py    +1 −1
python/sglang/srt/entrypoints/grpc_server.py             +2 −1
python/sglang/srt/grpc/sglang_scheduler.proto            +6 −0
python/sglang/srt/grpc/sglang_scheduler_pb2.py           +56 −56
python/sglang/srt/grpc/sglang_scheduler_pb2.pyi          +8 −4
sgl-router/benches/request_processing.rs                 +0 −1
sgl-router/src/proto/sglang_scheduler.proto              +6 −0
sgl-router/src/protocols/spec.rs                         +2 −4
sgl-router/src/reasoning_parser/README.md                +6 −6
sgl-router/src/reasoning_parser/factory.rs               +16 −16
sgl-router/src/reasoning_parser/mod.rs                   +1 −1
sgl-router/src/routers/grpc/pd_router.rs                 +2 −2
sgl-router/src/routers/grpc/router.rs                    +716 −54
sgl-router/src/server.rs                                 +3 −3
sgl-router/src/tool_parser/parsers/deepseek_parser.rs    +4 −5
sgl-router/src/tool_parser/parsers/glm4_moe_parser.rs    +4 −5
sgl-router/src/tool_parser/parsers/gpt_oss_parser.rs     +0 −5
sgl-router/src/tool_parser/parsers/helpers.rs            +42 −0
sgl-router/src/tool_parser/parsers/json_parser.rs        +5 −10
sgl-router/src/tool_parser/parsers/kimik2_parser.rs      +4 −5
python/sglang/srt/entrypoints/grpc_request_manager.py

 @@ -578,7 +578,7 @@ class GrpcRequestManager:
                     batch_out.cached_tokens[i] if batch_out.cached_tokens else 0
                 ),
                 "finish_reason": (
-                    str(batch_out.finished_reasons[i])
+                    batch_out.finished_reasons[i]
                     if batch_out.finished_reasons[i]
                     else None
                 ),
python/sglang/srt/entrypoints/grpc_server.py

 @@ -112,7 +112,6 @@ def _launch_scheduler_process_only(
             pp_rank,
             None,
             writer,
-            None,
         ),
     )

 @@ -583,6 +582,7 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
                 cached_tokens=meta_info.get("cached_tokens", 0),
                 output_logprobs=output_logprobs_proto,
                 input_logprobs=input_logprobs_proto,
+                index=output.get("index", 0),
             ),
         )

 @@ -640,6 +640,7 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
                 cached_tokens=meta_info.get("cached_tokens", 0),
                 output_logprobs=output_logprobs_proto,
                 input_logprobs=input_logprobs_proto,
+                index=output.get("index", 0),
                 **matched_stop_kwargs,
             ),
         )
python/sglang/srt/grpc/sglang_scheduler.proto

 @@ -179,6 +179,9 @@ message GenerateStreamChunk {
   // Input logprobs (if requested) - only in first chunk
   InputLogProbs input_logprobs = 7;
+
+  // Index for ordering when n>1 (for parallel request multiplexing)
+  uint32 index = 8;
 }

 message GenerateComplete {

 @@ -207,6 +210,9 @@ message GenerateComplete {
   // Input logprobs if requested (for prompt tokens)
   InputLogProbs input_logprobs = 10;
+
+  // Index for ordering when n>1 (for parallel request multiplexing)
+  uint32 index = 11;
 }

 message GenerateError {
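The comments above explain the new field: when a request asks for `n>1` completions, all generations are multiplexed onto one gRPC stream, and `index` tells the consumer which choice a chunk belongs to. A minimal sketch of demultiplexing on the consumer side — `StreamChunk` here is a hand-rolled stand-in for the generated message type, not the real tonic binding:

```rust
use std::collections::HashMap;

// Illustrative stand-in for the generated GenerateStreamChunk message.
struct StreamChunk {
    index: u32,          // which of the n parallel generations this chunk belongs to
    token_ids: Vec<u32>, // tokens produced in this chunk
}

/// Regroup interleaved chunks from a single stream into per-choice token buffers.
fn demux(chunks: impl IntoIterator<Item = StreamChunk>) -> HashMap<u32, Vec<u32>> {
    let mut per_choice: HashMap<u32, Vec<u32>> = HashMap::new();
    for chunk in chunks {
        per_choice.entry(chunk.index).or_default().extend(chunk.token_ids);
    }
    per_choice
}

fn main() {
    // Chunks for choice 0 and choice 1 arrive interleaved on one stream.
    let interleaved = vec![
        StreamChunk { index: 0, token_ids: vec![11, 12] },
        StreamChunk { index: 1, token_ids: vec![21] },
        StreamChunk { index: 0, token_ids: vec![13] },
    ];
    let per_choice = demux(interleaved);
    assert_eq!(per_choice[&0], vec![11, 12, 13]);
    assert_eq!(per_choice[&1], vec![21]);
}
```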
python/sglang/srt/grpc/sglang_scheduler_pb2.py

 @@ -29,7 +29,7 @@ from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__
 from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2

 (Generated file. The single-line serialized FileDescriptor passed to
 _descriptor_pool.Default().AddSerializedFile(b'...') is regenerated; the only
 semantic change is the two new fields: `index` is appended to GenerateStreamChunk
 as field 8 (`\x12\r\n\x05index\x18\x08 \x01(\r`, message size marker \x86\x02 ->
 \x95\x02) and to GenerateComplete as field 11 (`\x12\r\n\x05index\x18\x0b \x01(\r`,
 size marker \x8c\x03 -> \x9b\x03).)

 _globals = globals()
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)

 @@ -53,59 +53,59 @@ if not _descriptor._USE_C_DESCRIPTORS:
   _globals['_GENERATERESPONSE']._serialized_start=1835
   _globals['_GENERATERESPONSE']._serialized_end=2062
   _globals['_GENERATESTREAMCHUNK']._serialized_start=2065
-  _globals['_GENERATESTREAMCHUNK']._serialized_end=2327
+  _globals['_GENERATESTREAMCHUNK']._serialized_end=2342
-  _globals['_GENERATECOMPLETE']._serialized_start=2330
+  _globals['_GENERATECOMPLETE']._serialized_start=2345
-  _globals['_GENERATECOMPLETE']._serialized_end=2726
+  _globals['_GENERATECOMPLETE']._serialized_end=2756
 (The 15 extra descriptor bytes in GenerateStreamChunk and 15 more in
 GenerateComplete push every subsequent message's _serialized_start/_serialized_end
 offsets up by 30 in the same fashion, e.g. _GENERATEERROR 2728–2803 -> 2758–2833,
 _EMBEDREQUEST 3252–3582 -> 3282–3612, ..., _SGLANGSCHEDULER 4927–5309 -> 4957–5339.)
 # @@protoc_insertion_point(module_scope)
python/sglang/srt/grpc/sglang_scheduler_pb2.pyi

 @@ -160,7 +160,7 @@ class GenerateResponse(_message.Message):
     def __init__(self, request_id: _Optional[str] = ..., chunk: _Optional[_Union[GenerateStreamChunk, _Mapping]] = ..., complete: _Optional[_Union[GenerateComplete, _Mapping]] = ..., error: _Optional[_Union[GenerateError, _Mapping]] = ...) -> None: ...

 class GenerateStreamChunk(_message.Message):
-    __slots__ = ("token_ids", "prompt_tokens", "completion_tokens", "cached_tokens", "output_logprobs", "hidden_states", "input_logprobs")
+    __slots__ = ("token_ids", "prompt_tokens", "completion_tokens", "cached_tokens", "output_logprobs", "hidden_states", "input_logprobs", "index")
     TOKEN_IDS_FIELD_NUMBER: _ClassVar[int]
     PROMPT_TOKENS_FIELD_NUMBER: _ClassVar[int]
     COMPLETION_TOKENS_FIELD_NUMBER: _ClassVar[int]

 @@ -168,6 +168,7 @@ class GenerateStreamChunk(_message.Message):
     OUTPUT_LOGPROBS_FIELD_NUMBER: _ClassVar[int]
     HIDDEN_STATES_FIELD_NUMBER: _ClassVar[int]
     INPUT_LOGPROBS_FIELD_NUMBER: _ClassVar[int]
+    INDEX_FIELD_NUMBER: _ClassVar[int]
     token_ids: _containers.RepeatedScalarFieldContainer[int]
     prompt_tokens: int
     completion_tokens: int

 @@ -175,10 +176,11 @@ class GenerateStreamChunk(_message.Message):
     output_logprobs: OutputLogProbs
     hidden_states: _containers.RepeatedScalarFieldContainer[float]
     input_logprobs: InputLogProbs
-    def __init__(self, token_ids: _Optional[_Iterable[int]] = ..., prompt_tokens: _Optional[int] = ..., completion_tokens: _Optional[int] = ..., cached_tokens: _Optional[int] = ..., output_logprobs: _Optional[_Union[OutputLogProbs, _Mapping]] = ..., hidden_states: _Optional[_Iterable[float]] = ..., input_logprobs: _Optional[_Union[InputLogProbs, _Mapping]] = ...) -> None: ...
+    index: int
+    def __init__(self, token_ids: _Optional[_Iterable[int]] = ..., prompt_tokens: _Optional[int] = ..., completion_tokens: _Optional[int] = ..., cached_tokens: _Optional[int] = ..., output_logprobs: _Optional[_Union[OutputLogProbs, _Mapping]] = ..., hidden_states: _Optional[_Iterable[float]] = ..., input_logprobs: _Optional[_Union[InputLogProbs, _Mapping]] = ..., index: _Optional[int] = ...) -> None: ...

 class GenerateComplete(_message.Message):
-    __slots__ = ("output_ids", "finish_reason", "prompt_tokens", "completion_tokens", "cached_tokens", "output_logprobs", "all_hidden_states", "matched_token_id", "matched_stop_str", "input_logprobs")
+    __slots__ = ("output_ids", "finish_reason", "prompt_tokens", "completion_tokens", "cached_tokens", "output_logprobs", "all_hidden_states", "matched_token_id", "matched_stop_str", "input_logprobs", "index")
     OUTPUT_IDS_FIELD_NUMBER: _ClassVar[int]
     FINISH_REASON_FIELD_NUMBER: _ClassVar[int]
     PROMPT_TOKENS_FIELD_NUMBER: _ClassVar[int]

 @@ -189,6 +191,7 @@ class GenerateComplete(_message.Message):
     MATCHED_TOKEN_ID_FIELD_NUMBER: _ClassVar[int]
     MATCHED_STOP_STR_FIELD_NUMBER: _ClassVar[int]
     INPUT_LOGPROBS_FIELD_NUMBER: _ClassVar[int]
+    INDEX_FIELD_NUMBER: _ClassVar[int]
     output_ids: _containers.RepeatedScalarFieldContainer[int]
     finish_reason: str
     prompt_tokens: int

 @@ -199,7 +202,8 @@ class GenerateComplete(_message.Message):
     matched_token_id: int
     matched_stop_str: str
     input_logprobs: InputLogProbs
-    def __init__(self, output_ids: _Optional[_Iterable[int]] = ..., finish_reason: _Optional[str] = ..., prompt_tokens: _Optional[int] = ..., completion_tokens: _Optional[int] = ..., cached_tokens: _Optional[int] = ..., output_logprobs: _Optional[_Union[OutputLogProbs, _Mapping]] = ..., all_hidden_states: _Optional[_Iterable[_Union[HiddenStates, _Mapping]]] = ..., matched_token_id: _Optional[int] = ..., matched_stop_str: _Optional[str] = ..., input_logprobs: _Optional[_Union[InputLogProbs, _Mapping]] = ...) -> None: ...
+    index: int
+    def __init__(self, output_ids: _Optional[_Iterable[int]] = ..., finish_reason: _Optional[str] = ..., prompt_tokens: _Optional[int] = ..., completion_tokens: _Optional[int] = ..., cached_tokens: _Optional[int] = ..., output_logprobs: _Optional[_Union[OutputLogProbs, _Mapping]] = ..., all_hidden_states: _Optional[_Iterable[_Union[HiddenStates, _Mapping]]] = ..., matched_token_id: _Optional[int] = ..., matched_stop_str: _Optional[str] = ..., input_logprobs: _Optional[_Union[InputLogProbs, _Mapping]] = ..., index: _Optional[int] = ...) -> None: ...

 class GenerateError(_message.Message):
     __slots__ = ("message", "http_status_code", "details")
sgl-router/benches/request_processing.rs

 @@ -192,7 +192,6 @@ fn create_large_chat_completion_request() -> ChatCompletionRequest {
             content: Some(format!(
                 "Answer {}: This is a detailed response about topic {} that covers multiple aspects and provides comprehensive analysis of the interconnected systems you mentioned.",
                 i, i
             )),
             name: None,
             tool_calls: None,
-            function_call: None,
             reasoning_content: None,
         });
     }
sgl-router/src/proto/sglang_scheduler.proto

(Mirrors the Python copy of the schema above.)

 @@ -179,6 +179,9 @@ message GenerateStreamChunk {
   // Input logprobs (if requested) - only in first chunk
   InputLogProbs input_logprobs = 7;
+
+  // Index for ordering when n>1 (for parallel request multiplexing)
+  uint32 index = 8;
 }

 message GenerateComplete {

 @@ -207,6 +210,9 @@ message GenerateComplete {
   // Input logprobs if requested (for prompt tokens)
   InputLogProbs input_logprobs = 10;
+
+  // Index for ordering when n>1 (for parallel request multiplexing)
+  uint32 index = 11;
 }

 message GenerateError {
sgl-router/src/protocols/spec.rs

 @@ -72,8 +72,6 @@ pub enum ChatMessage {
         name: Option<String>,
         #[serde(skip_serializing_if = "Option::is_none")]
         tool_calls: Option<Vec<ToolCall>>,
-        #[serde(skip_serializing_if = "Option::is_none")]
-        function_call: Option<FunctionCallResponse>,
         /// Reasoning content for O1-style models (SGLang extension)
         #[serde(skip_serializing_if = "Option::is_none")]
         reasoning_content: Option<String>,

 @@ -140,8 +138,6 @@ pub struct ChatMessageDelta {
     pub content: Option<String>,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub tool_calls: Option<Vec<ToolCallDelta>>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub function_call: Option<FunctionCallDelta>,
     /// Reasoning content delta for O1-style models (SGLang extension)
     #[serde(skip_serializing_if = "Option::is_none")]
     pub reasoning_content: Option<String>,

 @@ -473,6 +469,8 @@ pub struct ChatStreamChoice {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub logprobs: Option<ChatLogProbs>,
     pub finish_reason: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub matched_stop: Option<Value>,
 }

 // Completions API request types (v1/completions) - DEPRECATED but still supported
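The `skip_serializing_if` attribute on the new `matched_stop` field keeps it out of every streamed chunk except the final one that actually matched a stop condition. A small self-contained sketch of that serialization behavior (the struct here is a cut-down demo, not the real `ChatStreamChoice`):

```rust
use serde::Serialize;
use serde_json::{json, Value};

#[derive(Serialize)]
struct ChatStreamChoiceDemo {
    index: u32,
    finish_reason: Option<String>,
    // Omitted from the JSON entirely when None, matching the new field above.
    #[serde(skip_serializing_if = "Option::is_none")]
    matched_stop: Option<Value>,
}

fn main() {
    let mid = ChatStreamChoiceDemo { index: 0, finish_reason: None, matched_stop: None };
    let last = ChatStreamChoiceDemo {
        index: 0,
        finish_reason: Some("stop".into()),
        matched_stop: Some(json!("</s>")),
    };
    // {"index":0,"finish_reason":null}
    println!("{}", serde_json::to_string(&mid).unwrap());
    // {"index":0,"finish_reason":"stop","matched_stop":"</s>"}
    println!("{}", serde_json::to_string(&last).unwrap());
}
```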
sgl-router/src/reasoning_parser/README.md

 @@ -44,7 +44,7 @@ graph TB
     end
     subgraph Factory Layer
-        MID --> PF[ParserFactory]
+        MID --> PF[ReasoningParserFactory]
         PF --> REG[ParserRegistry]
         REG --> PM[Pattern Matching]
         PM --> PP[Parser Pool]

 @@ -93,7 +93,7 @@ graph TB
 ```mermaid
 sequenceDiagram
     participant C as Client
-    participant F as ParserFactory
+    participant F as ReasoningParserFactory
     participant R as Registry
     participant P as Parser Pool
     participant BP as BaseParser

 @@ -206,7 +206,7 @@ classDiagram
         +new() Self
     }
-    class ParserFactory {
+    class ReasoningParserFactory {
         -registry: ParserRegistry
         +new() Self
         +get_pooled(model_id: &str) PooledParser

 @@ -240,7 +240,7 @@ classDiagram
     Step3Parser o-- BaseReasoningParser
     BaseReasoningParser o-- ParserConfig
-    ParserFactory o-- ParserRegistry
+    ReasoningParserFactory o-- ParserRegistry
     ParserRegistry o-- ReasoningParser
 ```

 @@ -302,7 +302,7 @@ classDiagram
    - Delegate to get_pooled_parser
    - Case-insensitive comparison

-**ParserFactory Methods**:
+**ReasoningParserFactory Methods**:

 1. **`new()`**:
    - Register all built-in parsers

 @@ -437,7 +437,7 @@ impl ReasoningParser for MyModelParser {
 **Step 2: Register in Factory**
 ```rust
-// In factory.rs ParserFactory::new()
+// In factory.rs ReasoningParserFactory::new()
 registry.register_parser("mymodel", || {
     Box::new(MyModelParser::new())
 });
sgl-router/src/reasoning_parser/factory.rs

 @@ -128,11 +128,11 @@ impl Default for ParserRegistry {
 /// Factory for creating reasoning parsers based on model type.
 #[derive(Clone)]
-pub struct ParserFactory {
+pub struct ReasoningParserFactory {
     registry: ParserRegistry,
 }

-impl ParserFactory {
+impl ReasoningParserFactory {
     /// Create a new factory with default parsers registered.
     pub fn new() -> Self {
         let registry = ParserRegistry::new();

 @@ -237,7 +237,7 @@ impl ParserFactory {
     }
 }

-impl Default for ParserFactory {
+impl Default for ReasoningParserFactory {
     fn default() -> Self {
         Self::new()
     }

 @@ -249,35 +249,35 @@ mod tests {
     #[test]
     fn test_factory_creates_deepseek_r1() {
-        let factory = ParserFactory::new();
+        let factory = ReasoningParserFactory::new();
         let parser = factory.create("deepseek-r1-distill").unwrap();
         assert_eq!(parser.model_type(), "deepseek_r1");
     }

 (The remaining test hunks — at -249,35, -289,21, -321,7, -347,7, -364,7, -383,7, and
 -527,7 — make the same mechanical one-line change in test_factory_creates_qwen3,
 test_factory_creates_kimi, test_factory_fallback_to_passthrough,
 test_case_insensitive_matching, test_step3_model, test_glm45_model,
 test_pooled_parser_reuse, test_pooled_parser_concurrent_access, test_pool_clearing,
 test_passthrough_parser_pooling, the 100-thread stress test, and
 test_concurrent_pool_modifications: `let factory = ParserFactory::new();` becomes
 `let factory = ReasoningParserFactory::new();`.)
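For orientation, the renamed factory's public surface as exercised by the tests above — a minimal sketch; the crate path `sglang_router_rs` is an assumption, and the asserts mirror the test expectations rather than adding new behavior:

```rust
use sglang_router_rs::reasoning_parser::ReasoningParserFactory; // hypothetical crate path

fn main() {
    let factory = ReasoningParserFactory::new();

    // create() does case-insensitive pattern matching on the model id and
    // falls back to a passthrough parser for unknown models.
    let parser = factory.create("deepseek-r1-distill").unwrap();
    assert_eq!(parser.model_type(), "deepseek_r1");
    let fallback = factory.create("unknown-model").unwrap();
    assert_eq!(fallback.model_type(), "passthrough");

    // get_pooled() returns a shared, mutex-wrapped parser, so repeated lookups
    // for the same model reuse one instance (see the pooling tests above).
    let a = factory.get_pooled("deepseek-r1");
    let b = factory.get_pooled("deepseek-r1");
    let _ = (a, b);
}
```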
sgl-router/src/reasoning_parser/mod.rs

 @@ -2,7 +2,7 @@ pub mod factory;
 pub mod parsers;
 pub mod traits;

-pub use factory::{ParserFactory, ParserRegistry, PooledParser};
+pub use factory::{ParserRegistry, PooledParser, ReasoningParserFactory};
 pub use parsers::{
     BaseReasoningParser, DeepSeekR1Parser, Glm45Parser, KimiParser, Qwen3Parser,
     QwenThinkingParser, Step3Parser,
sgl-router/src/routers/grpc/pd_router.rs

 @@ -4,7 +4,7 @@ use crate::config::types::RetryConfig;
 use crate::core::{WorkerRegistry, WorkerType};
 use crate::metrics::RouterMetrics;
 use crate::policies::PolicyRegistry;
-use crate::reasoning_parser::ParserFactory;
+use crate::reasoning_parser::ReasoningParserFactory;
 use crate::routers::RouterTrait;
 use crate::tokenizer::traits::Tokenizer;
 use crate::tool_parser::ToolParserFactory;

 @@ -24,7 +24,7 @@ pub struct GrpcPDRouter {
     worker_registry: Arc<WorkerRegistry>,
     policy_registry: Arc<PolicyRegistry>,
     tokenizer: Arc<dyn Tokenizer>,
-    reasoning_parser_factory: ParserFactory,
+    reasoning_parser_factory: ReasoningParserFactory,
     tool_parser_factory: ToolParserFactory,
     dp_aware: bool,
sgl-router/src/routers/grpc/router.rs

 @@ -7,10 +7,14 @@ use async_trait::async_trait;
 use axum::{
     body::Body,
     extract::Request,
-    http::{HeaderMap, StatusCode},
+    http::{header::CONTENT_TYPE, HeaderMap, HeaderValue, StatusCode},
     response::{IntoResponse, Response},
     Json,
 };
+use bytes::Bytes;
+use std::io;
+use tokio::sync::mpsc;
+use tokio_stream::wrappers::UnboundedReceiverStream;
 use tracing::{debug, error, info, warn};

 use crate::config::types::RetryConfig;
 @@ -21,11 +25,12 @@ use crate::policies::PolicyRegistry;
 use crate::protocols::spec::ChatMessage;
 use crate::protocols::spec::{
-    ChatChoice, ChatCompletionMessage, ChatCompletionRequest, ChatCompletionResponse,
-    CompletionRequest, EmbeddingRequest, FunctionCallResponse, GenerateRequest, RerankRequest,
-    ResponsesGetParams, ResponsesRequest, StringOrArray, Tool, ToolCall, ToolChoice,
+    ChatChoice, ChatCompletionMessage, ChatCompletionRequest, ChatCompletionResponse,
+    ChatCompletionStreamResponse, ChatMessageDelta, ChatStreamChoice, CompletionRequest,
+    EmbeddingRequest, FunctionCallDelta, FunctionCallResponse, GenerateRequest, RerankRequest,
+    ResponsesGetParams, ResponsesRequest, StringOrArray, Tool, ToolCall, ToolCallDelta,
+    ToolChoice, ToolChoiceValue, Usage,
 };
-use crate::reasoning_parser::ParserFactory;
+use crate::reasoning_parser::{ParserResult, ReasoningParserFactory};
 use crate::routers::RouterTrait;
 use crate::server::AppContext;
 use crate::tokenizer::chat_template::{ChatTemplateContentFormat, ChatTemplateParams};
 @@ -34,7 +39,7 @@ use crate::tokenizer::stop::{
 };
 use crate::tokenizer::traits::Tokenizer;
 use crate::tokenizer::HuggingFaceTokenizer;
-use crate::tool_parser::ToolParserFactory;
+use crate::tool_parser::{StreamingParseResult, ToolParserFactory};
 use proto::generate_response::Response::{Chunk, Complete, Error};
 use serde_json::{json, Map, Value};
 use std::time::{Instant, SystemTime, UNIX_EPOCH};
 @@ -50,12 +55,13 @@ pub struct ProcessedMessages {
 }

 /// gRPC router implementation for SGLang
 #[derive(Clone)]
+#[allow(dead_code)]
 pub struct GrpcRouter {
     worker_registry: Arc<WorkerRegistry>,
     policy_registry: Arc<PolicyRegistry>,
     tokenizer: Arc<dyn Tokenizer>,
-    reasoning_parser_factory: ParserFactory,
+    reasoning_parser_factory: ReasoningParserFactory,
     tool_parser_factory: ToolParserFactory,
     dp_aware: bool,
     api_key: Option<String>,
 @@ -776,10 +782,11 @@ impl GrpcRouter {
     }

     /// Parse tool calls using model-specific parser
-    async fn parse_with_model_parser(
+    async fn parse_tool_calls(
         &self,
         processed_text: &str,
         model: &str,
+        history_tool_calls_count: usize,
     ) -> (Option<Vec<ToolCall>>, String) {
         // Get pooled parser for this model
         let pooled_parser = self.tool_parser_factory.get_pooled(model);
 @@ -810,16 +817,26 @@ impl GrpcRouter {
         let spec_tool_calls = parsed_tool_calls
             .into_iter()
-            .map(|tc| ToolCall {
-                id: tc.id,
-                tool_type: "function".to_string(),
-                function: FunctionCallResponse {
-                    name: tc.function.name,
-                    arguments: Some(
-                        serde_json::to_string(&tc.function.arguments)
-                            .unwrap_or_else(|_| "{}".to_string()),
-                    ),
-                },
+            .enumerate()
+            .map(|(index, tc)| {
+                // Generate ID for this tool call
+                let id = Self::generate_tool_call_id(
+                    model,
+                    &tc.function.name,
+                    index,
+                    history_tool_calls_count,
+                );
+                ToolCall {
+                    id,
+                    tool_type: "function".to_string(),
+                    function: FunctionCallResponse {
+                        name: tc.function.name,
+                        arguments: Some(
+                            serde_json::to_string(&tc.function.arguments)
+                                .unwrap_or_else(|_| "{}".to_string()),
+                        ),
+                    },
+                }
             })
             .collect();

         (Some(spec_tool_calls), normal_text)
 @@ -920,6 +937,47 @@ impl GrpcRouter {
         builder.build()
     }

+    /// Count the number of tool calls in the request message history
+    /// This is used for KimiK2 format which needs globally unique indices
+    fn get_history_tool_calls_count(request: &ChatCompletionRequest) -> usize {
+        request
+            .messages
+            .iter()
+            .filter_map(|msg| {
+                if let ChatMessage::Assistant { tool_calls, .. } = msg {
+                    tool_calls.as_ref().map(|calls| calls.len())
+                } else {
+                    None
+                }
+            })
+            .sum()
+    }
+
+    /// Generate a tool call ID based on model format
+    ///
+    /// # Arguments
+    /// * `model` - Model name to determine ID format
+    /// * `tool_name` - Name of the tool being called
+    /// * `tool_index` - Index of this tool call within the current message
+    /// * `history_count` - Number of tool calls in previous messages
+    ///
+    /// # Returns
+    /// A unique ID string. KimiK2 uses `functions.{name}:{global_index}`, others use `call_{uuid}`
+    fn generate_tool_call_id(
+        model: &str,
+        tool_name: &str,
+        tool_index: usize,
+        history_count: usize,
+    ) -> String {
+        if model.to_lowercase().contains("kimi") {
+            // KimiK2 format: functions.{name}:{global_index}
+            format!("functions.{}:{}", tool_name, history_count + tool_index)
+        } else {
+            // Standard OpenAI format: call_{24-char-uuid}
+            format!("call_{}", &Uuid::new_v4().simple().to_string()[..24])
+        }
+    }
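For illustration, the two ID shapes `generate_tool_call_id` produces — with Kimi models the index is offset by tool calls already present in the conversation history, while other models get a fresh UUID-based ID. A standalone re-statement of the helper's logic (not the private method itself), runnable with the `uuid` crate:

```rust
use uuid::Uuid;

// Mirrors the helper above, for illustration only.
fn tool_call_id(model: &str, name: &str, idx: usize, history: usize) -> String {
    if model.to_lowercase().contains("kimi") {
        // Globally unique index = calls already in history + position in this message.
        format!("functions.{}:{}", name, history + idx)
    } else {
        // call_ + first 24 hex chars of a v4 UUID.
        format!("call_{}", &Uuid::new_v4().simple().to_string()[..24])
    }
}

fn main() {
    // Two prior tool calls in history, first call in this message:
    assert_eq!(tool_call_id("kimi-k2", "get_weather", 0, 2), "functions.get_weather:2");
    // Non-Kimi models get a UUID-based ID, e.g. "call_9f2c41d6a8b34e6f8c1d2e3f".
    println!("{}", tool_call_id("gpt-4o", "get_weather", 0, 2));
}
```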
 @@ -953,6 +1011,230 @@ impl GrpcRouter {
         (chunk_text, false) // Return text and continue processing
     }

 (Everything below, through `format_sse_chunk`, is newly added streaming helpers.)

     /// Helper: Process reasoning content in streaming mode
     /// Returns (modified_delta, optional_reasoning_chunk)
     fn process_reasoning_stream(
         &self,
         delta: &str,
         index: u32,
         reasoning_parsers: &mut HashMap<
             u32,
             Arc<std::sync::Mutex<Box<dyn crate::reasoning_parser::ReasoningParser>>>,
         >,
         request_id: &str,
         model: &str,
         created: u64,
     ) -> (String, Option<ChatCompletionStreamResponse>) {
         // Get or create parser for this index
         reasoning_parsers
             .entry(index)
             .or_insert_with(|| self.reasoning_parser_factory.get_pooled(model));

         if let Some(pooled_parser) = reasoning_parsers.get(&index) {
             let parse_result = {
                 let mut parser = pooled_parser.lock().unwrap();
                 parser.parse_reasoning_streaming_incremental(delta)
             };

             match parse_result {
                 Ok(ParserResult {
                     reasoning_text,
                     normal_text,
                 }) => {
                     let chunk = if !reasoning_text.is_empty() {
                         Some(ChatCompletionStreamResponse {
                             id: request_id.to_string(),
                             object: "chat.completion.chunk".to_string(),
                             created,
                             model: model.to_string(),
                             system_fingerprint: None,
                             choices: vec![ChatStreamChoice {
                                 index,
                                 delta: ChatMessageDelta {
                                     role: Some("assistant".to_string()),
                                     content: None,
                                     tool_calls: None,
                                     reasoning_content: Some(reasoning_text),
                                 },
                                 logprobs: None,
                                 finish_reason: None,
                                 matched_stop: None,
                             }],
                             usage: None,
                         })
                     } else {
                         None
                     };
                     return (normal_text, chunk);
                 }
                 Err(e) => {
                     warn!("Reasoning parsing error: {}", e);
                 }
             }
         }

         (delta.to_string(), None)
     }
     /// Helper: Process tool calls in streaming mode
     /// Returns (should_skip_content, chunks_to_emit)
     #[allow(clippy::too_many_arguments)]
     async fn process_tool_calls_stream(
         &self,
         delta: &str,
         index: u32,
         tool_parsers: &mut HashMap<
             u32,
             Arc<tokio::sync::Mutex<Box<dyn crate::tool_parser::ToolParser>>>,
         >,
         has_tool_calls: &mut HashMap<u32, bool>,
         tools: &[crate::protocols::spec::Tool],
         request_id: &str,
         model: &str,
         created: u64,
         history_tool_calls_count: usize,
     ) -> (bool, Vec<ChatCompletionStreamResponse>) {
         let mut chunks = Vec::new();

         // Get or create parser for this index
         tool_parsers
             .entry(index)
             .or_insert_with(|| self.tool_parser_factory.get_pooled(model));

         if let Some(pooled_parser) = tool_parsers.get(&index) {
             let mut parser = pooled_parser.lock().await;
             match parser.parse_incremental(delta, tools).await {
                 Ok(StreamingParseResult { normal_text, calls }) => {
                     // Emit normal text if present
                     if !normal_text.is_empty() {
                         chunks.push(ChatCompletionStreamResponse {
                             id: request_id.to_string(),
                             object: "chat.completion.chunk".to_string(),
                             created,
                             model: model.to_string(),
                             system_fingerprint: None,
                             choices: vec![ChatStreamChoice {
                                 index,
                                 delta: ChatMessageDelta {
                                     role: Some("assistant".to_string()),
                                     content: Some(normal_text),
                                     tool_calls: None,
                                     reasoning_content: None,
                                 },
                                 logprobs: None,
                                 finish_reason: None,
                                 matched_stop: None,
                             }],
                             usage: None,
                         });
                     }

                     // Emit tool call chunks
                     for tool_call_item in calls {
                         has_tool_calls.insert(index, true);

                         let tool_call_id = if let Some(ref name) = tool_call_item.name {
                             Some(Self::generate_tool_call_id(
                                 model,
                                 name,
                                 tool_call_item.tool_index,
                                 history_tool_calls_count,
                             ))
                         } else {
                             None
                         };

                         let tool_call_delta = ToolCallDelta {
                             index: tool_call_item.tool_index as u32,
                             id: tool_call_id,
                             tool_type: if tool_call_item.name.is_some() {
                                 Some("function".to_string())
                             } else {
                                 None
                             },
                             function: Some(FunctionCallDelta {
                                 name: tool_call_item.name,
                                 arguments: if !tool_call_item.parameters.is_empty() {
                                     Some(tool_call_item.parameters)
                                 } else {
                                     None
                                 },
                             }),
                         };

                         chunks.push(ChatCompletionStreamResponse {
                             id: request_id.to_string(),
                             object: "chat.completion.chunk".to_string(),
                             created,
                             model: model.to_string(),
                             system_fingerprint: None,
                             choices: vec![ChatStreamChoice {
                                 index,
                                 delta: ChatMessageDelta {
                                     role: Some("assistant".to_string()),
                                     content: None,
                                     tool_calls: Some(vec![tool_call_delta]),
                                     reasoning_content: None,
                                 },
                                 logprobs: None,
                                 finish_reason: None,
                                 matched_stop: None,
                             }],
                             usage: None,
                         });
                     }

                     // If we emitted chunks, skip regular content
                     return (!chunks.is_empty(), chunks);
                 }
                 Err(e) => {
                     warn!("Tool call parsing error: {}", e);
                 }
             }
         }

         (false, chunks)
     }
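On the wire, this helper therefore turns one parsed tool call into a sequence of chunks: the first delta for a call carries the generated `id`, the `function` type, and the tool name; later deltas append argument fragments under the same choice `index`. A hand-written illustration (not captured output — field spellings assume the OpenAI-compatible chunk shape that `ChatCompletionStreamResponse` models, and the exact JSON depends on the serde attributes in spec.rs):

data: {"id":"req-1","object":"chat.completion.chunk","created":1727800000,"model":"kimi-k2","choices":[{"index":0,"delta":{"role":"assistant","tool_calls":[{"index":0,"id":"functions.get_weather:0","type":"function","function":{"name":"get_weather"}}]},"finish_reason":null}]}

data: {"id":"req-1","object":"chat.completion.chunk","created":1727800000,"model":"kimi-k2","choices":[{"index":0,"delta":{"role":"assistant","tool_calls":[{"index":0,"function":{"arguments":"{\"city\":\"Paris\"}"}}]},"finish_reason":null}]}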
     /// Helper: Create content chunk
     fn create_content_chunk(
         content: String,
         index: u32,
         request_id: &str,
         model: &str,
         created: u64,
         logprobs: Option<crate::protocols::spec::ChatLogProbs>,
     ) -> ChatCompletionStreamResponse {
         ChatCompletionStreamResponse {
             id: request_id.to_string(),
             object: "chat.completion.chunk".to_string(),
             created,
             model: model.to_string(),
             system_fingerprint: None,
             choices: vec![ChatStreamChoice {
                 index,
                 delta: ChatMessageDelta {
                     role: Some("assistant".to_string()),
                     content: Some(content),
                     tool_calls: None,
                     reasoning_content: None,
                 },
                 logprobs,
                 finish_reason: None,
                 matched_stop: None,
             }],
             usage: None,
         }
     }

     /// Helper: Format response as SSE chunk
     fn format_sse_chunk(response: &ChatCompletionStreamResponse) -> String {
         format!(
             "data: {}\n\n",
             serde_json::to_string(response).unwrap_or_default()
         )
     }
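The framing is plain server-sent events: each JSON chunk becomes one `data:` record terminated by a blank line, and the stream ends with a literal `[DONE]` marker rather than a JSON payload (sent by the spawned task shown below). A minimal sketch of just that framing:

```rust
// Each serialized chunk is wrapped as a single SSE "data:" event.
fn frame_sse(json_chunk: &str) -> String {
    format!("data: {}\n\n", json_chunk)
}

fn main() {
    println!("{}", frame_sse(r#"{"id":"req-1","object":"chat.completion.chunk"}"#));
    // The spawned task in handle_streaming_chat terminates every stream with:
    print!("data: [DONE]\n\n");
}
```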
     /// Submit request and handle streaming response for chat completions route
     async fn handle_streaming_chat(
         &self,
         ...

 @@ -960,14 +1242,13 @@ impl GrpcRouter {
         request: proto::GenerateRequest,
         original_request: &ChatCompletionRequest,
     ) -> Response {
-        let mut stop_decoder = self.create_stop_decoder(
-            original_request.stop.as_ref(),
-            original_request.stop_token_ids.as_ref(),
-            original_request.skip_special_tokens,
-            original_request.no_stop_trim,
-        );
+        let request_id = request.request_id.clone();
+        let model = original_request.model.clone();
+
+        // Create channel for SSE streaming
+        let (tx, rx) = mpsc::unbounded_channel::<Result<Bytes, io::Error>>();

-        // Process streaming tokens
+        // Start the gRPC stream
         let mut grpc_stream = match client.generate(request).await {
             Ok(stream) => stream,
             Err(e) => {
@@ -980,49 +1261,414 @@ impl GrpcRouter {
        }
    };

-   let mut decoded_text = String::new();
    let stop_params = (
        original_request.stop.clone(),
        original_request.stop_token_ids.clone(),
        original_request.skip_special_tokens,
        original_request.no_stop_trim,
    );

    // Spawn processing task
    let self_clone = self.clone();
    let original_request_clone = original_request.clone();
    tokio::spawn(async move {
        let result = Self::process_streaming_chunks(
            &self_clone,
            &mut grpc_stream,
            request_id,
            model,
            stop_params,
            original_request_clone,
            &tx,
        )
        .await;
        if let Err(e) = result {
            let error_chunk = format!(
                "data: {}\n\n",
                json!({"error": {"message": e, "type": "internal_error"}})
            );
            let _ = tx.send(Ok(Bytes::from(error_chunk)));
        }
        // Send DONE marker
        let _ = tx.send(Ok(Bytes::from("data: [DONE]\n\n")));
    });

    // Create response with SSE headers
    let stream = UnboundedReceiverStream::new(rx);
    let mut response = Response::new(Body::from_stream(stream));
    *response.status_mut() = StatusCode::OK;
    response
        .headers_mut()
        .insert(CONTENT_TYPE, HeaderValue::from_static("text/event-stream"));
    response
        .headers_mut()
        .insert("Cache-Control", HeaderValue::from_static("no-cache"));
    response
        .headers_mut()
        .insert("Connection", HeaderValue::from_static("keep-alive"));
    response
}
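The handler above follows a common producer/consumer pattern: a spawned task drains the gRPC stream and pushes SSE frames into an unbounded channel, while the HTTP response body drains the receiver. A minimal sketch of that pattern (illustrative only; assumes tokio, tokio-stream, and bytes as dependencies; the print loop stands in for the HTTP body):

use bytes::Bytes;
use tokio::sync::mpsc;
use tokio_stream::{wrappers::UnboundedReceiverStream, StreamExt};

#[tokio::main]
async fn main() {
    let (tx, rx) = mpsc::unbounded_channel::<Result<Bytes, std::io::Error>>();

    // Producer: in the router this task consumes the gRPC stream.
    tokio::spawn(async move {
        for i in 0..3 {
            let _ = tx.send(Ok(Bytes::from(format!("data: chunk-{}\n\n", i))));
        }
        // Terminal marker telling SSE clients the stream is complete.
        let _ = tx.send(Ok(Bytes::from("data: [DONE]\n\n")));
    });

    // Consumer: in the router this stream becomes the HTTP response body.
    let mut body = UnboundedReceiverStream::new(rx);
    while let Some(Ok(frame)) = body.next().await {
        print!("{}", String::from_utf8_lossy(&frame));
    }
}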
/// Process streaming chunks and send SSE events
async fn process_streaming_chunks(
    router: &GrpcRouter,
    grpc_stream: &mut (impl tokio_stream::Stream<Item = Result<proto::GenerateResponse, tonic::Status>> + Unpin),
    request_id: String,
    model: String,
    stop_params: (Option<StringOrArray>, Option<Vec<u32>>, bool, bool),
    original_request: ChatCompletionRequest,
    tx: &mpsc::UnboundedSender<Result<Bytes, io::Error>>,
) -> Result<(), String> {
    // Extract request parameters
    let separate_reasoning = original_request.separate_reasoning;
    let tool_choice = &original_request.tool_choice;
    let tools = &original_request.tools;
    let history_tool_calls_count = Self::get_history_tool_calls_count(&original_request);
    let stream_options = &original_request.stream_options;

    // Phase 1: Initialize state tracking (per-index for n>1 support)
    let mut is_firsts: HashMap<u32, bool> = HashMap::new();
    let mut stream_buffers: HashMap<u32, String> = HashMap::new();
    let mut finish_reasons: HashMap<u32, String> = HashMap::new();
    let mut matched_stops: HashMap<u32, Option<Value>> = HashMap::new();
    let mut prompt_tokens: HashMap<u32, u32> = HashMap::new();
    let mut completion_tokens: HashMap<u32, u32> = HashMap::new();
    let mut cached_tokens: HashMap<u32, u32> = HashMap::new();

    // Parser state (lazy initialization per index)
    type PooledReasoningParser = Arc<std::sync::Mutex<Box<dyn crate::reasoning_parser::ReasoningParser>>>;
    let mut reasoning_parsers: HashMap<u32, PooledReasoningParser> = HashMap::new();
    type PooledToolParser = Arc<tokio::sync::Mutex<Box<dyn crate::tool_parser::ToolParser>>>;
    let mut tool_parsers: HashMap<u32, PooledToolParser> = HashMap::new();
    let mut has_tool_calls: HashMap<u32, bool> = HashMap::new();

    // Create stop decoder
    let (stop, stop_token_ids, skip_special_tokens, no_stop_trim) = stop_params;
    let mut stop_decoder = router.create_stop_decoder(
        stop.as_ref(),
        stop_token_ids.as_ref(),
        skip_special_tokens,
        no_stop_trim,
    );

    let created = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default()
        .as_secs();

    // Phase 2: Main streaming loop
    while let Some(response) = grpc_stream.next().await {
-       let gen_response = match response {
-           Ok(resp) => resp,
-           Err(e) => {
-               error!("Stream error: {}", e);
-               break;
-           }
-       };
        let gen_response = response.map_err(|e| format!("Stream error: {}", e))?;

        match gen_response.response {
            Some(Chunk(chunk)) => {
-               // Process tokens and check if we should stop
-               let (chunk_text, should_stop) =
                let index = chunk.index;

                // Process tokens through stop decoder
                let (chunk_text, _should_stop) =
                    Self::process_chunk_tokens(&mut stop_decoder, &chunk.token_ids);
-               decoded_text.push_str(&chunk_text);
-               if should_stop {
-                   break;
-               }
                if chunk_text.is_empty() {
                    continue;
                }

                // Process logprobs if present
                let choice_logprobs = if let Some(ref proto_logprobs) = chunk.output_logprobs {
                    match router.convert_proto_to_openai_logprobs(proto_logprobs) {
                        Ok(logprobs) => Some(logprobs),
                        Err(e) => {
                            warn!("Failed to process logprobs: {}", e);
                            None
                        }
                    }
                } else {
                    None
                };

                // Initialize stream buffer if first time
                let stream_buffer = stream_buffers.entry(index).or_default();

                // Send first chunk with role
                if is_firsts.get(&index).copied().unwrap_or(true) {
                    let first_chunk = ChatCompletionStreamResponse {
                        id: request_id.clone(),
                        object: "chat.completion.chunk".to_string(),
                        created,
                        model: model.clone(),
                        system_fingerprint: None,
                        choices: vec![ChatStreamChoice {
                            index,
                            delta: ChatMessageDelta {
                                role: Some("assistant".to_string()),
                                content: None,
                                tool_calls: None,
                                reasoning_content: None,
                            },
                            logprobs: None,
                            finish_reason: None,
                            matched_stop: None,
                        }],
                        usage: None,
                    };
                    tx.send(Ok(Bytes::from(Self::format_sse_chunk(&first_chunk))))
                        .map_err(|_| "Failed to send first chunk".to_string())?;
                    is_firsts.insert(index, false);
                }

                // Calculate delta
                let mut delta = chunk_text;
                stream_buffer.push_str(&delta);

                // Reasoning content handling
                if separate_reasoning {
                    let (normal_text, reasoning_chunk) = router.process_reasoning_stream(
                        &delta,
                        index,
                        &mut reasoning_parsers,
                        &request_id,
                        &model,
                        created,
                    );
                    if let Some(chunk) = reasoning_chunk {
                        tx.send(Ok(Bytes::from(Self::format_sse_chunk(&chunk))))
                            .map_err(|_| "Failed to send reasoning chunk".to_string())?;
                    }
                    delta = normal_text;
                }

                // Tool call handling
                let tool_choice_enabled =
                    !matches!(tool_choice, Some(ToolChoice::Value(ToolChoiceValue::None)));
                if tool_choice_enabled && tools.is_some() {
                    let (should_skip, tool_chunks) = router
                        .process_tool_calls_stream(
                            &delta,
                            index,
                            &mut tool_parsers,
                            &mut has_tool_calls,
                            tools.as_ref().unwrap(),
                            &request_id,
                            &model,
                            created,
                            history_tool_calls_count,
                        )
                        .await;
                    for chunk in tool_chunks {
                        tx.send(Ok(Bytes::from(Self::format_sse_chunk(&chunk))))
                            .map_err(|_| "Failed to send tool call chunk".to_string())?;
                    }
                    if should_skip {
                        continue;
                    }
                }

                // Regular content emission
                if !delta.is_empty() {
                    let content_chunk = Self::create_content_chunk(
                        delta,
                        index,
                        &request_id,
                        &model,
                        created,
                        choice_logprobs,
                    );
                    tx.send(Ok(Bytes::from(Self::format_sse_chunk(&content_chunk))))
                        .map_err(|_| "Failed to send content chunk".to_string())?;
                }
                continue;
            }
-           Some(Complete(_complete)) => {
            Some(Complete(complete)) => {
                // Flush any remaining text
                if let SequenceDecoderOutput::Text(text) = stop_decoder.flush() {
                    if !text.is_empty() {
-                       decoded_text.push_str(&text);
-                       debug!("Flushed text: {}", text);
                        let index = complete.index;
                        let stream_buffer = stream_buffers.entry(index).or_default();
                        stream_buffer.push_str(&text);
                        let content_chunk = ChatCompletionStreamResponse {
                            id: request_id.clone(),
                            object: "chat.completion.chunk".to_string(),
                            created,
                            model: model.clone(),
                            system_fingerprint: None,
                            choices: vec![ChatStreamChoice {
                                index,
                                delta: ChatMessageDelta {
                                    role: Some("assistant".to_string()),
                                    content: Some(text),
                                    tool_calls: None,
                                    reasoning_content: None,
                                },
                                logprobs: None,
                                finish_reason: None,
                                matched_stop: None,
                            }],
                            usage: None,
                        };
                        let sse_chunk = serde_json::to_string(&content_chunk)
                            .map_err(|e| format!("Failed to serialize content chunk: {}", e))?;
                        tx.send(Ok(Bytes::from(format!("data: {}\n\n", sse_chunk))))
                            .map_err(|_| "Failed to send flushed content".to_string())?;
                    }
                }

                // Store metadata
                let index = complete.index;
                prompt_tokens.insert(index, complete.prompt_tokens as u32);
                completion_tokens.insert(index, complete.completion_tokens as u32);
                cached_tokens.insert(index, complete.cached_tokens as u32);
                finish_reasons.insert(index, complete.finish_reason.clone());

                // Extract matched_stop
                let matched_stop_value = match &complete.matched_stop {
                    Some(proto::generate_complete::MatchedStop::MatchedTokenId(token_id)) => {
                        Some(Value::Number(serde_json::Number::from(*token_id)))
                    }
                    Some(proto::generate_complete::MatchedStop::MatchedStopStr(stop_str)) => {
                        Some(Value::String(stop_str.clone()))
                    }
                    None => None,
                };
                matched_stops.insert(index, matched_stop_value);
                break;
            }
            Some(Error(error)) => {
                error!("Generation error: {}", error.message);
-               break;
                return Err(error.message);
            }
            None => continue,
        }
    }

-   // TODO: Replace with proper SSE streaming response
-   // For now, return the complete decoded text
-   (StatusCode::OK, format!("Decoded text: {}", decoded_text)).into_response()

    // Phase 3: Check unstreamed tool args
    // Check if parsers have any remaining arguments that haven't been streamed yet
    for (index, parser) in &tool_parsers {
        let parser_guard = parser.lock().await;
        if let Some(unstreamed_items) = parser_guard.get_unstreamed_tool_args() {
            for tool_call_item in unstreamed_items {
                let tool_call_delta = ToolCallDelta {
                    index: tool_call_item.tool_index as u32,
                    id: None,
                    tool_type: None, // No type for argument deltas
                    function: Some(FunctionCallDelta {
                        name: None, // No name for argument deltas
                        arguments: if !tool_call_item.parameters.is_empty() {
                            Some(tool_call_item.parameters)
                        } else {
                            None
                        },
                    }),
                };
                let tool_chunk = ChatCompletionStreamResponse {
                    id: request_id.clone(),
                    object: "chat.completion.chunk".to_string(),
                    created,
                    model: model.clone(),
                    system_fingerprint: None,
                    choices: vec![ChatStreamChoice {
                        index: *index,
                        delta: ChatMessageDelta {
                            role: Some("assistant".to_string()),
                            content: None,
                            tool_calls: Some(vec![tool_call_delta]),
                            reasoning_content: None,
                        },
                        logprobs: None,
                        finish_reason: None,
                        matched_stop: None,
                    }],
                    usage: None,
                };
                let sse_chunk = serde_json::to_string(&tool_chunk)
                    .map_err(|e| format!("Failed to serialize tool chunk: {}", e))?;
                tx.send(Ok(Bytes::from(format!("data: {}\n\n", sse_chunk))))
                    .map_err(|_| "Failed to send unstreamed tool args".to_string())?;
            }
        }
    }

    // Phase 4: Finish reason chunks
    for (index, finish_reason) in finish_reasons.iter() {
        let final_finish_reason = if has_tool_calls.get(index).copied().unwrap_or(false)
            && finish_reason == "stop"
        {
            "tool_calls".to_string()
        } else {
            finish_reason.clone()
        };
        let matched_stop_value = matched_stops.get(index).and_then(|v| v.clone());
        let finish_chunk = ChatCompletionStreamResponse {
            id: request_id.clone(),
            object: "chat.completion.chunk".to_string(),
            created,
            model: model.clone(),
            system_fingerprint: None,
            choices: vec![ChatStreamChoice {
                index: *index,
                delta: ChatMessageDelta {
                    role: Some("assistant".to_string()),
                    content: None,
                    tool_calls: None,
                    reasoning_content: None,
                },
                logprobs: None,
                finish_reason: Some(final_finish_reason),
                matched_stop: matched_stop_value,
            }],
            usage: None,
        };
        let sse_chunk = serde_json::to_string(&finish_chunk)
            .map_err(|e| format!("Failed to serialize finish chunk: {}", e))?;
        tx.send(Ok(Bytes::from(format!("data: {}\n\n", sse_chunk))))
            .map_err(|_| "Failed to send finish chunk".to_string())?;
    }

    // Phase 5: Usage chunk
    if let Some(stream_opts) = stream_options {
        if stream_opts.include_usage.unwrap_or(false) {
            let total_prompt: u32 = prompt_tokens.values().sum();
            let total_completion: u32 = completion_tokens.values().sum();
            let usage_chunk = ChatCompletionStreamResponse {
                id: request_id.clone(),
                object: "chat.completion.chunk".to_string(),
                created,
                model: model.clone(),
                system_fingerprint: None,
                choices: vec![],
                usage: Some(Usage {
                    prompt_tokens: total_prompt,
                    completion_tokens: total_completion,
                    total_tokens: total_prompt + total_completion,
                    completion_tokens_details: None,
                }),
            };
            let sse_chunk = serde_json::to_string(&usage_chunk)
                .map_err(|e| format!("Failed to serialize usage chunk: {}", e))?;
            tx.send(Ok(Bytes::from(format!("data: {}\n\n", sse_chunk))))
                .map_err(|_| "Failed to send usage chunk".to_string())?;
        }
    }

    Ok(())
}
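One subtlety worth calling out in Phase 4: a choice that emitted tool calls reports finish_reason "tool_calls" even though the scheduler finished with "stop". A small sketch of that mapping (a hypothetical free function extracted for illustration; the router applies it inline above):

// Hypothetical helper, not part of the diff; mirrors the Phase 4 logic.
fn final_finish_reason(has_tool_calls: bool, finish_reason: &str) -> String {
    // OpenAI-style semantics: a normal stop that produced tool calls is
    // surfaced to clients as "tool_calls" rather than "stop".
    if has_tool_calls && finish_reason == "stop" {
        "tool_calls".to_string()
    } else {
        finish_reason.to_string()
    }
}

fn main() {
    assert_eq!(final_finish_reason(true, "stop"), "tool_calls");
    assert_eq!(final_finish_reason(false, "stop"), "stop");
    assert_eq!(final_finish_reason(true, "length"), "length");
}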
/// Submit request and handle non-streaming response for chat completions route
...
...
@@ -1082,10 +1728,17 @@ impl GrpcRouter {
    }

    // Process each response into a ChatChoice
    let history_tool_calls_count = Self::get_history_tool_calls_count(original_request);
    let mut choices = Vec::new();
    for (index, complete) in all_responses.iter().enumerate() {
        match self
-           .process_single_choice(complete, index, original_request, &mut stop_decoder)
            .process_single_choice(
                complete,
                index,
                original_request,
                &mut stop_decoder,
                history_tool_calls_count,
            )
            .await
        {
            Ok(choice) => choices.push(choice),
...
...
@@ -1216,11 +1869,12 @@ impl GrpcRouter {
        decoded_text.push_str(&t);
    }

-   let output_ids = complete.output_ids.clone();
    let output_ids = std::mem::take(&mut complete.output_ids);
    let finish_reason = std::mem::take(&mut complete.finish_reason);

    // Build base meta_info using json! macro
    let mut meta_info = json!({
-       "finish_reason": complete.finish_reason.clone(),
        "finish_reason": finish_reason,
        "prompt_tokens": complete.prompt_tokens,
        "completion_tokens": complete.completion_tokens,
        "cached_tokens": complete.cached_tokens,
...
...
@@ -1269,9 +1923,13 @@ impl GrpcRouter {
    })
    .collect();

-   // Build ChatLogProbsContent for each token
-   for (i, &logprob) in proto_logprobs.token_logprobs.iter().enumerate() {
-       let token_text = token_texts.get(i).cloned().unwrap_or_default();
    // Build ChatLogProbsContent for each token (consume iterator to avoid clones)
    for (i, (&logprob, token_text)) in proto_logprobs
        .token_logprobs
        .iter()
        .zip(token_texts.into_iter())
        .enumerate()
    {
        let bytes = Some(token_text.as_bytes().to_vec());

        // Build top_logprobs for this position
...
...
@@ -1324,6 +1982,7 @@ impl GrpcRouter {
    index: usize,
    original_request: &ChatCompletionRequest,
    stop_decoder: &mut StopSequenceDecoder,
    history_tool_calls_count: usize,
) -> Result<ChatChoice, String> {
    stop_decoder.reset();

    // Decode tokens
...
...
@@ -1401,7 +2060,11 @@ impl GrpcRouter {
            self.parse_json_schema_response(&processed_text, &original_request.tool_choice);
        } else {
            (tool_calls, processed_text) = self
-               .parse_with_model_parser(&processed_text, &original_request.model)
                .parse_tool_calls(
                    &processed_text,
                    &original_request.model,
                    history_tool_calls_count,
                )
                .await;
        }
    }
...
...
@@ -1686,7 +2349,6 @@ mod tests {
        content: Some("Assistant response".to_string()),
        name: None,
        tool_calls: None,
-       function_call: None,
        reasoning_content: None,
    }];
...
...
sgl-router/src/server.rs View file @ 963175d5
...
...
@@ -15,7 +15,7 @@ use crate::{
        },
        worker_spec::{WorkerApiResponse, WorkerConfigRequest, WorkerErrorResponse},
    },
-   reasoning_parser::ParserFactory,
    reasoning_parser::ReasoningParserFactory,
    routers::{router_manager::RouterManager, RouterTrait},
    service_discovery::{start_service_discovery, ServiceDiscoveryConfig},
    tokenizer::{factory as tokenizer_factory, traits::Tokenizer},
...
...
@@ -45,7 +45,7 @@ pub struct AppContext {
    pub router_config: RouterConfig,
    pub rate_limiter: Arc<TokenBucket>,
    pub tokenizer: Option<Arc<dyn Tokenizer>>,
-   pub reasoning_parser_factory: Option<ParserFactory>,
    pub reasoning_parser_factory: Option<ReasoningParserFactory>,
    pub tool_parser_factory: Option<ToolParserFactory>,
    pub worker_registry: Arc<WorkerRegistry>,
    pub policy_registry: Arc<PolicyRegistry>,
...
...
@@ -79,7 +79,7 @@ impl AppContext {
                tokenizer_factory::create_tokenizer(&tokenizer_path)
                    .map_err(|e| format!("Failed to create tokenizer: {e}"))?,
            );
-           let reasoning_parser_factory = Some(ParserFactory::new());
            let reasoning_parser_factory = Some(ReasoningParserFactory::new());
            let tool_parser_factory = Some(ToolParserFactory::new());
            (tokenizer, reasoning_parser_factory, tool_parser_factory)
...
...
sgl-router/src/tool_parser/parsers/deepseek_parser.rs View file @ 963175d5
...
...
@@ -123,12 +123,7 @@ impl DeepSeekParser {
        let arguments = serde_json::to_string(&args)
            .map_err(|e| ToolParserError::ParsingFailed(e.to_string()))?;

-       // Generate ID
-       let id = format!("deepseek_call_{}", uuid::Uuid::new_v4());
        Ok(ToolCall {
-           id,
-           r#type: "function".to_string(),
            function: FunctionCall {
                name: func_name.to_string(),
                arguments,
...
...
@@ -320,4 +315,8 @@ impl ToolParser for DeepSeekParser {
    fn detect_format(&self, text: &str) -> bool {
        self.has_tool_markers(text)
    }

    fn get_unstreamed_tool_args(&self) -> Option<Vec<ToolCallItem>> {
        helpers::get_unstreamed_args(&self.prev_tool_call_arr, &self.streamed_args_for_tool)
    }
}
sgl-router/src/tool_parser/parsers/glm4_moe_parser.rs View file @ 963175d5
...
...
@@ -129,12 +129,7 @@ impl Glm4MoeParser {
        let arguments_str = serde_json::to_string(&arguments)
            .map_err(|e| ToolParserError::ParsingFailed(e.to_string()))?;

-       // Generate ID
-       let id = format!("glm4_call_{}", uuid::Uuid::new_v4());
        Ok(Some(ToolCall {
-           id,
-           r#type: "function".to_string(),
            function: FunctionCall {
                name: func_name.to_string(),
                arguments: arguments_str,
...
...
@@ -321,4 +316,8 @@ impl ToolParser for Glm4MoeParser {
    fn detect_format(&self, text: &str) -> bool {
        self.has_tool_markers(text)
    }

    fn get_unstreamed_tool_args(&self) -> Option<Vec<ToolCallItem>> {
        helpers::get_unstreamed_args(&self.prev_tool_call_arr, &self.streamed_args_for_tool)
    }
}
sgl-router/src/tool_parser/parsers/gpt_oss_parser.rs View file @ 963175d5
...
...
@@ -113,12 +113,7 @@ impl ToolParser for GptOssParser {
            }
        };

-       // Generate unique ID
-       let id = format!("gpt_oss_call_{}", uuid::Uuid::new_v4());
        tools.push(ToolCall {
-           id,
-           r#type: "function".to_string(),
            function: FunctionCall {
                name: function_name,
                arguments,
...
...
sgl-router/src/tool_parser/parsers/helpers.rs View file @ 963175d5
...
...
@@ -14,6 +14,48 @@ pub fn get_tool_indices(tools: &[Tool]) -> HashMap<String, usize> {
        .collect()
}
/// Get unstreamed tool call arguments
/// Returns tool call items for arguments that have been parsed but not yet streamed.
/// This ensures tool calls are properly completed even if the model generates the final arguments in the last chunk.
pub fn get_unstreamed_args(
    prev_tool_call_arr: &[Value],
    streamed_args_for_tool: &[String],
) -> Option<Vec<ToolCallItem>> {
    // Check if we have tool calls being tracked
    if prev_tool_call_arr.is_empty() || streamed_args_for_tool.is_empty() {
        return None;
    }

    // Get the last tool call that was being processed
    let tool_index = prev_tool_call_arr.len() - 1;
    if tool_index >= streamed_args_for_tool.len() {
        return None;
    }

    // Get expected vs actual arguments
    let expected_args = prev_tool_call_arr[tool_index].get("arguments")?;
    let expected_str = serde_json::to_string(expected_args).ok()?;
    let actual_str = &streamed_args_for_tool[tool_index];

    // Check if there are remaining arguments to send
    let remaining = if expected_str.starts_with(actual_str) {
        &expected_str[actual_str.len()..]
    } else {
        return None;
    };

    if remaining.is_empty() {
        return None;
    }

    // Return the remaining arguments as a ToolCallItem
    Some(vec![ToolCallItem {
        tool_index,
        name: None, // No name for argument deltas
        parameters: remaining.to_string(),
    }])
}
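A usage sketch for the new helper (a hypothetical test, not part of the diff): with one tracked call whose complete arguments are {"city":"Paris"} and only the prefix {"city": streamed so far, the remaining suffix comes back as a single argument delta:

#[cfg(test)]
mod unstreamed_args_sketch {
    use super::*;
    use serde_json::json;

    #[test]
    fn flushes_remaining_argument_suffix() {
        // One tracked call: full arguments known, only a prefix streamed so far.
        let prev = vec![json!({"name": "get_weather", "arguments": {"city": "Paris"}})];
        let streamed = vec![r#"{"city":"#.to_string()];

        let items = get_unstreamed_args(&prev, &streamed)
            .expect("a remaining argument suffix to flush");
        assert_eq!(items[0].tool_index, 0);
        assert_eq!(items[0].name, None);
        assert_eq!(items[0].parameters, r#""Paris"}"#);
    }
}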
/// Check if a buffer ends with a partial occurrence of a token
/// Returns Some(length) if there's a partial match, None otherwise
pub fn ends_with_partial_token(buffer: &str, token: &str) -> Option<usize> {
...
...
sgl-router/src/tool_parser/parsers/json_parser.rs View file @ 963175d5
...
...
@@ -8,7 +8,7 @@ use crate::tool_parser::{
    parsers::helpers,
    partial_json::PartialJson,
    traits::ToolParser,
-   types::{FunctionCall, StreamingParseResult, ToolCall},
    types::{FunctionCall, StreamingParseResult, ToolCall, ToolCallItem},
};

/// JSON format parser for tool calls
...
...
@@ -136,16 +136,7 @@ impl JsonParser {
        let arguments = serde_json::to_string(args)
            .map_err(|e| ToolParserError::ParsingFailed(e.to_string()))?;

-       // Generate a unique ID if not provided
-       let id = obj
-           .get("id")
-           .and_then(|v| v.as_str())
-           .map(String::from)
-           .unwrap_or_else(|| format!("call_{}", uuid::Uuid::new_v4()));
        Ok(Some(ToolCall {
-           id,
-           r#type: "function".to_string(),
            function: FunctionCall {
                name: name.to_string(),
                arguments,
...
...
@@ -274,4 +265,8 @@ impl ToolParser for JsonParser {
        let trimmed = text.trim();
        (trimmed.starts_with('[') || trimmed.starts_with('{')) && trimmed.contains(r#""name""#)
    }

    fn get_unstreamed_tool_args(&self) -> Option<Vec<ToolCallItem>> {
        helpers::get_unstreamed_args(&self.prev_tool_call_arr, &self.streamed_args_for_tool)
    }
}
sgl-router/src/tool_parser/parsers/kimik2_parser.rs View file @ 963175d5
...
...
@@ -131,12 +131,7 @@ impl ToolParser for KimiK2Parser {
        // Try to parse JSON arguments
        match serde_json::from_str::<serde_json::Value>(function_args) {
            Ok(_) => {
-               // Generate unique ID
-               let id = format!("kimi_call_{}", uuid::Uuid::new_v4());
                tools.push(ToolCall {
-                   id,
-                   r#type: "function".to_string(),
                    function: FunctionCall {
                        name: func_name,
                        arguments: function_args.to_string(),
...
...
@@ -339,4 +334,8 @@ impl ToolParser for KimiK2Parser {
    fn detect_format(&self, text: &str) -> bool {
        self.has_tool_markers(text) || text.contains("<|tool_call_begin|>")
    }

    fn get_unstreamed_tool_args(&self) -> Option<Vec<ToolCallItem>> {
        helpers::get_unstreamed_args(&self.prev_tool_call_arr, &self.streamed_args_for_tool)
    }
}