change / sglang · Commits · 5937a56d

Commit 5937a56d (unverified), authored Sep 29, 2025 by Chang Su, committed by GitHub on Sep 29, 2025
[router][grpc] Add logprobs support to router (#11082)
Parent: f065e5be

Showing 7 changed files with 323 additions and 96 deletions (+323 −96)
python/sglang/srt/entrypoints/grpc_request_manager.py   +96 −9
python/sglang/srt/entrypoints/grpc_server.py            +54 −0
python/sglang/srt/grpc/sglang_scheduler.proto           +10 −8
python/sglang/srt/grpc/sglang_scheduler_pb2.py          +52 −52
python/sglang/srt/grpc/sglang_scheduler_pb2.pyi         +16 −16
sgl-router/src/proto/sglang_scheduler.proto             +10 −8
sgl-router/src/routers/grpc/router.rs                   +85 −3
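
Taken together, the changes plumb logprobs end to end through the gRPC path: the request manager accumulates raw per-token logprobs, the gRPC server transports them in the new `output_logprobs`/`input_logprobs` proto fields, and the Rust router converts them into OpenAI-style logprobs. A minimal client sketch of what the new fields enable — the message and field names come from the proto in this commit, while the address, port, and overall client flow are illustrative assumptions, not part of the change:

    # Hypothetical client sketch: request logprobs over the scheduler's gRPC API.
    # Field names (return_logprob, top_logprobs_num, output_logprobs, input_logprobs)
    # come from the proto in this commit; the channel setup is an assumption.
    import grpc
    from sglang.srt.grpc import sglang_scheduler_pb2 as pb2
    from sglang.srt.grpc import sglang_scheduler_pb2_grpc as pb2_grpc

    channel = grpc.insecure_channel("localhost:30000")  # assumed address
    stub = pb2_grpc.SglangSchedulerStub(channel)

    request = pb2.GenerateRequest(
        request_id="req-1",
        tokenized=pb2.TokenizedInput(original_text="Hello", input_ids=[9906]),
        sampling_params=pb2.SamplingParams(temperature=0.7, max_new_tokens=8),
        return_logprob=True,   # ask the scheduler to compute logprobs
        top_logprobs_num=5,    # top-k alternatives per position
        stream=True,
    )

    for response in stub.Generate(request):  # Generate is a server-streaming RPC
        if response.HasField("chunk") and response.chunk.HasField("output_logprobs"):
            lp = response.chunk.output_logprobs  # incremental: only this chunk's tokens
            print(list(zip(lp.token_ids, lp.token_logprobs)))
        elif response.HasField("complete"):
            print("finish:", response.complete.finish_reason)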
python/sglang/srt/entrypoints/grpc_request_manager.py  (view file @ 5937a56d)
@@ -82,6 +82,7 @@ class GrpcReqState:
     # Streaming state
     stream_finished: bool = False
+    input_logprobs_sent: bool = False  # Track if input logprobs were sent in streaming

     # Token accumulation (for non-streaming)
     output_ids: List[int] = dataclasses.field(default_factory=list)
...
@@ -516,19 +517,105 @@ class GrpcRequestManager:
                 },
             }

-            # Add logprobs if available
-            if batch_out.output_token_logprobs_val and i < len(
-                batch_out.output_token_logprobs_val
-            ):
-                output_data["logprobs"] = {
-                    "tokens": batch_out.output_token_logprobs_val[i],
-                    "top_logprobs": (
-                        batch_out.output_top_logprobs_val[i]
-                        if batch_out.output_top_logprobs_val
-                        and i < len(batch_out.output_top_logprobs_val)
-                        else None
-                    ),
-                }
+            # Accumulate input logprobs (only once, usually in first chunk)
+            if batch_out.input_token_logprobs_val and i < len(
+                batch_out.input_token_logprobs_val
+            ):
+                if not state.input_token_logprobs_val:
+                    state.input_token_logprobs_val.extend(
+                        batch_out.input_token_logprobs_val[i]
+                    )
+                    if batch_out.input_token_logprobs_idx and i < len(
+                        batch_out.input_token_logprobs_idx
+                    ):
+                        state.input_token_logprobs_idx.extend(
+                            batch_out.input_token_logprobs_idx[i]
+                        )
+                    if batch_out.input_top_logprobs_val and i < len(
+                        batch_out.input_top_logprobs_val
+                    ):
+                        state.input_top_logprobs_val.extend(
+                            batch_out.input_top_logprobs_val[i]
+                        )
+                    if batch_out.input_top_logprobs_idx and i < len(
+                        batch_out.input_top_logprobs_idx
+                    ):
+                        state.input_top_logprobs_idx.extend(
+                            batch_out.input_top_logprobs_idx[i]
+                        )
+
+            # Send input logprobs based on mode
+            if state.input_token_logprobs_val:
+                if state.obj.stream and not state.input_logprobs_sent:
+                    # Streaming: send input logprobs once in first chunk that has them
+                    output_data["input_logprobs"] = {
+                        "token_logprobs_val": state.input_token_logprobs_val,
+                        "token_logprobs_idx": state.input_token_logprobs_idx,
+                        "top_logprobs_val": state.input_top_logprobs_val,
+                        "top_logprobs_idx": state.input_top_logprobs_idx,
+                    }
+                    state.input_logprobs_sent = True
+                elif not state.obj.stream and output_data["finished"]:
+                    # Non-streaming: send input logprobs in final chunk
+                    output_data["input_logprobs"] = {
+                        "token_logprobs_val": state.input_token_logprobs_val,
+                        "token_logprobs_idx": state.input_token_logprobs_idx,
+                        "top_logprobs_val": state.input_top_logprobs_val,
+                        "top_logprobs_idx": state.input_top_logprobs_idx,
+                    }
+
+            # Add output logprobs if available (RAW - no detokenization!)
+            if batch_out.output_token_logprobs_val and i < len(
+                batch_out.output_token_logprobs_val
+            ):
+                # Accumulate in state first
+                state.output_token_logprobs_val.extend(
+                    batch_out.output_token_logprobs_val[i]
+                )
+                if batch_out.output_token_logprobs_idx and i < len(
+                    batch_out.output_token_logprobs_idx
+                ):
+                    state.output_token_logprobs_idx.extend(
+                        batch_out.output_token_logprobs_idx[i]
+                    )
+                if batch_out.output_top_logprobs_val and i < len(
+                    batch_out.output_top_logprobs_val
+                ):
+                    state.output_top_logprobs_val.extend(
+                        batch_out.output_top_logprobs_val[i]
+                    )
+                if batch_out.output_top_logprobs_idx and i < len(
+                    batch_out.output_top_logprobs_idx
+                ):
+                    state.output_top_logprobs_idx.extend(
+                        batch_out.output_top_logprobs_idx[i]
+                    )
+
+                if state.obj.stream:
+                    # For streaming: send incremental logprobs (only new tokens in this chunk)
+                    # NOTE: this is different than TokenizerManager, which always accumulates
+                    def get_part(attr_name):
+                        source_list = getattr(batch_out, attr_name, None)
+                        return (
+                            source_list[i]
+                            if source_list and i < len(source_list)
+                            else []
+                        )
+
+                    output_data["output_logprobs"] = {
+                        "token_logprobs_val": batch_out.output_token_logprobs_val[i],
+                        "token_logprobs_idx": get_part("output_token_logprobs_idx"),
+                        "top_logprobs_val": get_part("output_top_logprobs_val"),
+                        "top_logprobs_idx": get_part("output_top_logprobs_idx"),
+                    }
+                elif output_data["finished"]:
+                    # Non-streaming: send cumulative output logprobs in final chunk
+                    output_data["output_logprobs"] = {
+                        "token_logprobs_val": state.output_token_logprobs_val,
+                        "token_logprobs_idx": state.output_token_logprobs_idx,
+                        "top_logprobs_val": state.output_top_logprobs_val,
+                        "top_logprobs_idx": state.output_top_logprobs_idx,
+                    }

             # Update state for accumulation
             if output_data["token_ids"]:
...
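
The manager now emits two differently shaped payloads: each streaming chunk carries only the logprobs for tokens new in that chunk (unlike TokenizerManager, which always accumulates), while the non-streaming final payload carries the cumulative lists. A small sketch of the resulting consumer-side contract, assuming chunks shaped like the `output_data` dicts above (the helper itself is hypothetical):

    # Hypothetical consumer of the output_data payloads produced above.
    def merge_streamed_logprobs(chunks):
        """Concatenate per-chunk incremental logprobs into cumulative lists."""
        merged = {
            "token_logprobs_val": [],
            "token_logprobs_idx": [],
            "top_logprobs_val": [],
            "top_logprobs_idx": [],
        }
        for chunk in chunks:
            lp = chunk.get("output_logprobs")
            if lp:  # each streamed chunk holds only the tokens new in that chunk
                for key in merged:
                    merged[key].extend(lp.get(key, []))
        return merged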
python/sglang/srt/entrypoints/grpc_server.py  (view file @ 5937a56d)
@@ -472,11 +472,51 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
             ignore_eos=grpc_params.ignore_eos,
         )

+    def _convert_logprobs_to_proto(
+        self, logprobs_data: Dict
+    ) -> Optional[sglang_scheduler_pb2.LogProbs]:
+        """Convert logprobs dict to proto LogProbs format (transport RAW data only)."""
+        if not logprobs_data:
+            return None
+
+        token_logprobs_val = logprobs_data.get("token_logprobs_val", [])
+        token_logprobs_idx = logprobs_data.get("token_logprobs_idx", [])
+        top_logprobs_val = logprobs_data.get("top_logprobs_val", [])
+        top_logprobs_idx = logprobs_data.get("top_logprobs_idx", [])
+
+        # Build TopLogProbs entries
+        top_logprobs_proto = []
+        if top_logprobs_val and top_logprobs_idx:
+            for val_list, idx_list in zip(top_logprobs_val, top_logprobs_idx):
+                top_logprobs_proto.append(
+                    sglang_scheduler_pb2.TopLogProbs(
+                        values=val_list,
+                        token_ids=idx_list,
+                    )
+                )
+
+        return sglang_scheduler_pb2.LogProbs(
+            token_logprobs=token_logprobs_val,
+            token_ids=token_logprobs_idx,
+            top_logprobs=top_logprobs_proto,
+        )
+
     def _create_chunk_response(
         self, request_id: str, output: Dict
     ) -> sglang_scheduler_pb2.GenerateResponse:
         """Create a streaming chunk response."""
         meta_info = output.get("meta_info", {})

+        # Convert output logprobs if present
+        output_logprobs_proto = self._convert_logprobs_to_proto(
+            output.get("output_logprobs")
+        )
+
+        # Convert input logprobs if present (only in first chunk)
+        input_logprobs_proto = self._convert_logprobs_to_proto(
+            output.get("input_logprobs")
+        )
+
         return sglang_scheduler_pb2.GenerateResponse(
             request_id=request_id,
             chunk=sglang_scheduler_pb2.GenerateStreamChunk(
...
@@ -484,6 +524,8 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
                 prompt_tokens=meta_info.get("prompt_tokens", 0),
                 completion_tokens=meta_info.get("completion_tokens", 0),
                 cached_tokens=meta_info.get("cached_tokens", 0),
+                output_logprobs=output_logprobs_proto,
+                input_logprobs=input_logprobs_proto,
             ),
         )
...
@@ -519,6 +561,16 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
         elif isinstance(matched, str):
             matched_stop_kwargs["matched_stop_str"] = matched

+        # Convert output logprobs if present
+        output_logprobs_proto = self._convert_logprobs_to_proto(
+            output.get("output_logprobs")
+        )
+
+        # Convert input logprobs if present
+        input_logprobs_proto = self._convert_logprobs_to_proto(
+            output.get("input_logprobs")
+        )
+
         return sglang_scheduler_pb2.GenerateResponse(
             request_id=request_id,
             complete=sglang_scheduler_pb2.GenerateComplete(
...
@@ -529,6 +581,8 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
                     "completion_tokens", len(output.get("token_ids", []))
                 ),
                 cached_tokens=meta_info.get("cached_tokens", 0),
+                output_logprobs=output_logprobs_proto,
+                input_logprobs=input_logprobs_proto,
                 **matched_stop_kwargs,
             ),
         )
...
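
`_convert_logprobs_to_proto` deliberately transports raw data only — float logprobs and token IDs, no detokenization — leaving text decoding to the router, which holds the tokenizer. A sketch of the mapping it performs, using the generated pb2 classes with made-up values:

    # Illustrative round-trip for _convert_logprobs_to_proto (values made up).
    from sglang.srt.grpc import sglang_scheduler_pb2 as pb2

    logprobs_data = {
        "token_logprobs_val": [-0.11, -1.73],   # chosen-token logprobs
        "token_logprobs_idx": [9906, 1917],     # chosen-token IDs
        "top_logprobs_val": [[-0.11, -2.4], [-1.73, -1.9]],
        "top_logprobs_idx": [[9906, 40], [1917, 11]],
    }

    proto = pb2.LogProbs(
        token_logprobs=logprobs_data["token_logprobs_val"],
        token_ids=logprobs_data["token_logprobs_idx"],
        top_logprobs=[
            pb2.TopLogProbs(values=v, token_ids=i)
            for v, i in zip(
                logprobs_data["top_logprobs_val"], logprobs_data["top_logprobs_idx"]
            )
        ],
    )
    assert list(proto.token_ids) == [9906, 1917]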
python/sglang/srt/grpc/sglang_scheduler.proto  (view file @ 5937a56d)
@@ -174,11 +174,14 @@ message GenerateStreamChunk {
   int32 completion_tokens = 3;
   int32 cached_tokens = 4;

-  // Logprobs (if requested)
-  LogProbs logprobs = 5;
+  // Output logprobs (if requested) - incremental for streaming
+  LogProbs output_logprobs = 5;

   // Hidden states (if requested)
   repeated float hidden_states = 6;
+
+  // Input logprobs (if requested) - only in first chunk
+  LogProbs input_logprobs = 7;
 }

 message GenerateComplete {
...
@@ -193,8 +196,8 @@ message GenerateComplete {
   int32 completion_tokens = 4;
   int32 cached_tokens = 5;

-  // All logprobs if requested
-  repeated LogProbs all_logprobs = 6;
+  // Output logprobs if requested (cumulative)
+  LogProbs output_logprobs = 6;

   // All hidden states if requested
   repeated HiddenStates all_hidden_states = 7;
...
@@ -204,6 +207,9 @@ message GenerateComplete {
     uint32 matched_token_id = 8;
     string matched_stop_str = 9;
   }
+
+  // Input logprobs if requested (for prompt tokens)
+  LogProbs input_logprobs = 10;
 }

 message GenerateError {
...
@@ -218,15 +224,11 @@ message LogProbs {
   // Top logprobs at each position
   repeated TopLogProbs top_logprobs = 3;
-
-  // Decoded text for tokens
-  repeated string token_texts = 4;
 }

 message TopLogProbs {
   repeated float values = 1;
   repeated int32 token_ids = 2;
-  repeated string token_texts = 3;
 }

 message HiddenStates {
...
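
Two schema details are worth spelling out: renaming field 5 from `logprobs` to `output_logprobs` keeps the field number, so it remains wire-compatible; changing `all_logprobs` from `repeated LogProbs` to a singular `output_logprobs` at field 6 and dropping the `token_texts` fields do change the wire contract, so scheduler and router stubs must be regenerated together. A sketch of reading the renamed and added fields with the regenerated Python stubs:

    # Reading the renamed and newly added fields with the regenerated stubs.
    from sglang.srt.grpc import sglang_scheduler_pb2 as pb2

    chunk = pb2.GenerateStreamChunk(
        token_ids=[1917],
        output_logprobs=pb2.LogProbs(token_logprobs=[-0.5], token_ids=[1917]),
    )
    # Singular message fields in proto3 support presence checks:
    assert chunk.HasField("output_logprobs")
    assert not chunk.HasField("input_logprobs")  # only set in the first chunk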
python/sglang/srt/grpc/sglang_scheduler_pb2.py  (view file @ 5937a56d)
...
@@ -29,7 +29,7 @@ from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__
 from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2

-DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x16sglang_scheduler.proto\x12\x15sglang.grpc.scheduler...')
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x16sglang_scheduler.proto\x12\x15sglang.grpc.scheduler...')

[Generated code: the serialized FileDescriptorProto byte strings (old and new) are elided here. The visible deltas match the .proto change: GenerateStreamChunk's field 5 is renamed `logprobs` to `output_logprobs` and the message gains `input_logprobs` as field 7; GenerateComplete's field 6 changes from `repeated LogProbs all_logprobs` to a singular `output_logprobs` and the message gains `input_logprobs` as field 10; the `token_texts` entries disappear from LogProbs and TopLogProbs.]

 _globals = globals()
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
...
@@ -53,55 +53,55 @@ if not _descriptor._USE_C_DESCRIPTORS:
   _globals['_GENERATERESPONSE']._serialized_start=1858
   _globals['_GENERATERESPONSE']._serialized_end=2085
   _globals['_GENERATESTREAMCHUNK']._serialized_start=2088
-  _globals['_GENERATESTREAMCHUNK']._serialized_end=2275
+  _globals['_GENERATESTREAMCHUNK']._serialized_end=2339
-  _globals['_GENERATECOMPLETE']._serialized_start=2278
+  _globals['_GENERATECOMPLETE']._serialized_start=2342
-  _globals['_GENERATECOMPLETE']._serialized_end=2603
+  _globals['_GENERATECOMPLETE']._serialized_end=2727
-  _globals['_GENERATEERROR']._serialized_start=2605
+  _globals['_GENERATEERROR']._serialized_start=2729
-  _globals['_GENERATEERROR']._serialized_end=2680
+  _globals['_GENERATEERROR']._serialized_end=2804
-  _globals['_LOGPROBS']._serialized_start=2683
+  _globals['_LOGPROBS']._serialized_start=2806
-  _globals['_LOGPROBS']._serialized_end=2815
+  _globals['_LOGPROBS']._serialized_end=2917
-  _globals['_TOPLOGPROBS']._serialized_start=2817
+  _globals['_TOPLOGPROBS']._serialized_start=2919
-  _globals['_TOPLOGPROBS']._serialized_end=2886
+  _globals['_TOPLOGPROBS']._serialized_end=2967
-  _globals['_HIDDENSTATES']._serialized_start=2888
+  _globals['_HIDDENSTATES']._serialized_start=2969
-  _globals['_HIDDENSTATES']._serialized_end=2951
+  _globals['_HIDDENSTATES']._serialized_end=3032
-  _globals['_EMBEDREQUEST']._serialized_start=2954
+  _globals['_EMBEDREQUEST']._serialized_start=3035
-  _globals['_EMBEDREQUEST']._serialized_end=3284
+  _globals['_EMBEDREQUEST']._serialized_end=3365
-  _globals['_EMBEDRESPONSE']._serialized_start=3287
+  _globals['_EMBEDRESPONSE']._serialized_start=3368
-  _globals['_EMBEDRESPONSE']._serialized_end=3444
+  _globals['_EMBEDRESPONSE']._serialized_end=3525
-  _globals['_EMBEDCOMPLETE']._serialized_start=3447
+  _globals['_EMBEDCOMPLETE']._serialized_start=3528
-  _globals['_EMBEDCOMPLETE']._serialized_end=3610
+  _globals['_EMBEDCOMPLETE']._serialized_end=3691
-  _globals['_EMBEDDING']._serialized_start=3612
+  _globals['_EMBEDDING']._serialized_start=3693
-  _globals['_EMBEDDING']._serialized_end=3654
+  _globals['_EMBEDDING']._serialized_end=3735
-  _globals['_EMBEDERROR']._serialized_start=3656
+  _globals['_EMBEDERROR']._serialized_start=3737
-  _globals['_EMBEDERROR']._serialized_end=3716
+  _globals['_EMBEDERROR']._serialized_end=3797
-  _globals['_HEALTHCHECKREQUEST']._serialized_start=3718
+  _globals['_HEALTHCHECKREQUEST']._serialized_start=3799
-  _globals['_HEALTHCHECKREQUEST']._serialized_end=3796
+  _globals['_HEALTHCHECKREQUEST']._serialized_end=3877
-  _globals['_HEALTHCHECKRESPONSE']._serialized_start=3798
+  _globals['_HEALTHCHECKRESPONSE']._serialized_start=3879
-  _globals['_HEALTHCHECKRESPONSE']._serialized_end=3853
+  _globals['_HEALTHCHECKRESPONSE']._serialized_end=3934
-  _globals['_ABORTREQUEST']._serialized_start=3855
+  _globals['_ABORTREQUEST']._serialized_start=3936
-  _globals['_ABORTREQUEST']._serialized_end=3905
+  _globals['_ABORTREQUEST']._serialized_end=3986
-  _globals['_ABORTRESPONSE']._serialized_start=3907
+  _globals['_ABORTRESPONSE']._serialized_start=3988
-  _globals['_ABORTRESPONSE']._serialized_end=3956
+  _globals['_ABORTRESPONSE']._serialized_end=4037
-  _globals['_LOADLORAREQUEST']._serialized_start=3958
+  _globals['_LOADLORAREQUEST']._serialized_start=4039
-  _globals['_LOADLORAREQUEST']._serialized_end=4031
+  _globals['_LOADLORAREQUEST']._serialized_end=4112
-  _globals['_LOADLORARESPONSE']._serialized_start=4033
+  _globals['_LOADLORARESPONSE']._serialized_start=4114
-  _globals['_LOADLORARESPONSE']._serialized_end=4105
+  _globals['_LOADLORARESPONSE']._serialized_end=4186
-  _globals['_UNLOADLORAREQUEST']._serialized_start=4107
+  _globals['_UNLOADLORAREQUEST']._serialized_start=4188
-  _globals['_UNLOADLORAREQUEST']._serialized_end=4146
+  _globals['_UNLOADLORAREQUEST']._serialized_end=4227
-  _globals['_UNLOADLORARESPONSE']._serialized_start=4148
+  _globals['_UNLOADLORARESPONSE']._serialized_start=4229
-  _globals['_UNLOADLORARESPONSE']._serialized_end=4202
+  _globals['_UNLOADLORARESPONSE']._serialized_end=4283
-  _globals['_UPDATEWEIGHTSREQUEST']._serialized_start=4204
+  _globals['_UPDATEWEIGHTSREQUEST']._serialized_start=4285
-  _globals['_UPDATEWEIGHTSREQUEST']._serialized_end=4323
+  _globals['_UPDATEWEIGHTSREQUEST']._serialized_end=4404
-  _globals['_UPDATEWEIGHTSRESPONSE']._serialized_start=4325
+  _globals['_UPDATEWEIGHTSRESPONSE']._serialized_start=4406
-  _globals['_UPDATEWEIGHTSRESPONSE']._serialized_end=4382
+  _globals['_UPDATEWEIGHTSRESPONSE']._serialized_end=4463
-  _globals['_GETINTERNALSTATEREQUEST']._serialized_start=4384
+  _globals['_GETINTERNALSTATEREQUEST']._serialized_start=4465
-  _globals['_GETINTERNALSTATEREQUEST']._serialized_end=4429
+  _globals['_GETINTERNALSTATEREQUEST']._serialized_end=4510
-  _globals['_GETINTERNALSTATERESPONSE']._serialized_start=4431
+  _globals['_GETINTERNALSTATERESPONSE']._serialized_start=4512
-  _globals['_GETINTERNALSTATERESPONSE']._serialized_end=4497
+  _globals['_GETINTERNALSTATERESPONSE']._serialized_end=4578
-  _globals['_SETINTERNALSTATEREQUEST']._serialized_start=4499
+  _globals['_SETINTERNALSTATEREQUEST']._serialized_start=4580
-  _globals['_SETINTERNALSTATEREQUEST']._serialized_end=4564
+  _globals['_SETINTERNALSTATEREQUEST']._serialized_end=4645
-  _globals['_SETINTERNALSTATERESPONSE']._serialized_start=4566
+  _globals['_SETINTERNALSTATERESPONSE']._serialized_start=4647
-  _globals['_SETINTERNALSTATERESPONSE']._serialized_end=4626
+  _globals['_SETINTERNALSTATERESPONSE']._serialized_end=4707
-  _globals['_SGLANGSCHEDULER']._serialized_start=4629
+  _globals['_SGLANGSCHEDULER']._serialized_start=4710
-  _globals['_SGLANGSCHEDULER']._serialized_end=5011
+  _globals['_SGLANGSCHEDULER']._serialized_end=5092
 # @@protoc_insertion_point(module_scope)
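
Both generated Python files here are best reproduced from the `.proto` source rather than edited by hand. A plausible regeneration call via grpcio-tools — the include path, output directories, and use of `--pyi_out` are assumptions about the build, not commands taken from this commit:

    # Hypothetical regeneration of the pb2/pb2_grpc/pyi stubs with grpcio-tools;
    # actual include paths and output dirs in the sglang build may differ.
    from grpc_tools import protoc

    protoc.main([
        "grpc_tools.protoc",
        "-Ipython/sglang/srt/grpc",
        "--python_out=python/sglang/srt/grpc",
        "--pyi_out=python/sglang/srt/grpc",
        "--grpc_python_out=python/sglang/srt/grpc",
        "sglang_scheduler.proto",
    ])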
python/sglang/srt/grpc/sglang_scheduler_pb2.pyi  (view file @ 5937a56d)
@@ -162,42 +162,46 @@ class GenerateResponse(_message.Message):
     def __init__(self, request_id: _Optional[str] = ..., chunk: _Optional[_Union[GenerateStreamChunk, _Mapping]] = ..., complete: _Optional[_Union[GenerateComplete, _Mapping]] = ..., error: _Optional[_Union[GenerateError, _Mapping]] = ...) -> None: ...

 class GenerateStreamChunk(_message.Message):
-    __slots__ = ("token_ids", "prompt_tokens", "completion_tokens", "cached_tokens", "logprobs", "hidden_states")
+    __slots__ = ("token_ids", "prompt_tokens", "completion_tokens", "cached_tokens", "output_logprobs", "hidden_states", "input_logprobs")
     TOKEN_IDS_FIELD_NUMBER: _ClassVar[int]
     PROMPT_TOKENS_FIELD_NUMBER: _ClassVar[int]
     COMPLETION_TOKENS_FIELD_NUMBER: _ClassVar[int]
     CACHED_TOKENS_FIELD_NUMBER: _ClassVar[int]
-    LOGPROBS_FIELD_NUMBER: _ClassVar[int]
+    OUTPUT_LOGPROBS_FIELD_NUMBER: _ClassVar[int]
     HIDDEN_STATES_FIELD_NUMBER: _ClassVar[int]
+    INPUT_LOGPROBS_FIELD_NUMBER: _ClassVar[int]
     token_ids: _containers.RepeatedScalarFieldContainer[int]
     prompt_tokens: int
     completion_tokens: int
     cached_tokens: int
-    logprobs: LogProbs
+    output_logprobs: LogProbs
     hidden_states: _containers.RepeatedScalarFieldContainer[float]
+    input_logprobs: LogProbs
-    def __init__(self, token_ids: _Optional[_Iterable[int]] = ..., prompt_tokens: _Optional[int] = ..., completion_tokens: _Optional[int] = ..., cached_tokens: _Optional[int] = ..., logprobs: _Optional[_Union[LogProbs, _Mapping]] = ..., hidden_states: _Optional[_Iterable[float]] = ...) -> None: ...
+    def __init__(self, token_ids: _Optional[_Iterable[int]] = ..., prompt_tokens: _Optional[int] = ..., completion_tokens: _Optional[int] = ..., cached_tokens: _Optional[int] = ..., output_logprobs: _Optional[_Union[LogProbs, _Mapping]] = ..., hidden_states: _Optional[_Iterable[float]] = ..., input_logprobs: _Optional[_Union[LogProbs, _Mapping]] = ...) -> None: ...

 class GenerateComplete(_message.Message):
-    __slots__ = ("output_ids", "finish_reason", "prompt_tokens", "completion_tokens", "cached_tokens", "all_logprobs", "all_hidden_states", "matched_token_id", "matched_stop_str")
+    __slots__ = ("output_ids", "finish_reason", "prompt_tokens", "completion_tokens", "cached_tokens", "output_logprobs", "all_hidden_states", "matched_token_id", "matched_stop_str", "input_logprobs")
     OUTPUT_IDS_FIELD_NUMBER: _ClassVar[int]
     FINISH_REASON_FIELD_NUMBER: _ClassVar[int]
     PROMPT_TOKENS_FIELD_NUMBER: _ClassVar[int]
     COMPLETION_TOKENS_FIELD_NUMBER: _ClassVar[int]
     CACHED_TOKENS_FIELD_NUMBER: _ClassVar[int]
-    ALL_LOGPROBS_FIELD_NUMBER: _ClassVar[int]
+    OUTPUT_LOGPROBS_FIELD_NUMBER: _ClassVar[int]
     ALL_HIDDEN_STATES_FIELD_NUMBER: _ClassVar[int]
     MATCHED_TOKEN_ID_FIELD_NUMBER: _ClassVar[int]
     MATCHED_STOP_STR_FIELD_NUMBER: _ClassVar[int]
+    INPUT_LOGPROBS_FIELD_NUMBER: _ClassVar[int]
     output_ids: _containers.RepeatedScalarFieldContainer[int]
     finish_reason: str
     prompt_tokens: int
     completion_tokens: int
     cached_tokens: int
-    all_logprobs: _containers.RepeatedCompositeFieldContainer[LogProbs]
+    output_logprobs: LogProbs
     all_hidden_states: _containers.RepeatedCompositeFieldContainer[HiddenStates]
     matched_token_id: int
     matched_stop_str: str
+    input_logprobs: LogProbs
-    def __init__(self, output_ids: _Optional[_Iterable[int]] = ..., finish_reason: _Optional[str] = ..., prompt_tokens: _Optional[int] = ..., completion_tokens: _Optional[int] = ..., cached_tokens: _Optional[int] = ..., all_logprobs: _Optional[_Iterable[_Union[LogProbs, _Mapping]]] = ..., all_hidden_states: _Optional[_Iterable[_Union[HiddenStates, _Mapping]]] = ..., matched_token_id: _Optional[int] = ..., matched_stop_str: _Optional[str] = ...) -> None: ...
+    def __init__(self, output_ids: _Optional[_Iterable[int]] = ..., finish_reason: _Optional[str] = ..., prompt_tokens: _Optional[int] = ..., completion_tokens: _Optional[int] = ..., cached_tokens: _Optional[int] = ..., output_logprobs: _Optional[_Union[LogProbs, _Mapping]] = ..., all_hidden_states: _Optional[_Iterable[_Union[HiddenStates, _Mapping]]] = ..., matched_token_id: _Optional[int] = ..., matched_stop_str: _Optional[str] = ..., input_logprobs: _Optional[_Union[LogProbs, _Mapping]] = ...) -> None: ...

 class GenerateError(_message.Message):
     __slots__ = ("message", "http_status_code", "details")
...
@@ -210,26 +214,22 @@ class GenerateError(_message.Message):
     def __init__(self, message: _Optional[str] = ..., http_status_code: _Optional[str] = ..., details: _Optional[str] = ...) -> None: ...

 class LogProbs(_message.Message):
-    __slots__ = ("token_logprobs", "token_ids", "top_logprobs", "token_texts")
+    __slots__ = ("token_logprobs", "token_ids", "top_logprobs")
     TOKEN_LOGPROBS_FIELD_NUMBER: _ClassVar[int]
     TOKEN_IDS_FIELD_NUMBER: _ClassVar[int]
     TOP_LOGPROBS_FIELD_NUMBER: _ClassVar[int]
-    TOKEN_TEXTS_FIELD_NUMBER: _ClassVar[int]
     token_logprobs: _containers.RepeatedScalarFieldContainer[float]
     token_ids: _containers.RepeatedScalarFieldContainer[int]
     top_logprobs: _containers.RepeatedCompositeFieldContainer[TopLogProbs]
-    token_texts: _containers.RepeatedScalarFieldContainer[str]
-    def __init__(self, token_logprobs: _Optional[_Iterable[float]] = ..., token_ids: _Optional[_Iterable[int]] = ..., top_logprobs: _Optional[_Iterable[_Union[TopLogProbs, _Mapping]]] = ..., token_texts: _Optional[_Iterable[str]] = ...) -> None: ...
+    def __init__(self, token_logprobs: _Optional[_Iterable[float]] = ..., token_ids: _Optional[_Iterable[int]] = ..., top_logprobs: _Optional[_Iterable[_Union[TopLogProbs, _Mapping]]] = ...) -> None: ...

 class TopLogProbs(_message.Message):
-    __slots__ = ("values", "token_ids", "token_texts")
+    __slots__ = ("values", "token_ids")
     VALUES_FIELD_NUMBER: _ClassVar[int]
     TOKEN_IDS_FIELD_NUMBER: _ClassVar[int]
-    TOKEN_TEXTS_FIELD_NUMBER: _ClassVar[int]
     values: _containers.RepeatedScalarFieldContainer[float]
     token_ids: _containers.RepeatedScalarFieldContainer[int]
-    token_texts: _containers.RepeatedScalarFieldContainer[str]
-    def __init__(self, values: _Optional[_Iterable[float]] = ..., token_ids: _Optional[_Iterable[int]] = ..., token_texts: _Optional[_Iterable[str]] = ...) -> None: ...
+    def __init__(self, values: _Optional[_Iterable[float]] = ..., token_ids: _Optional[_Iterable[int]] = ...) -> None: ...

 class HiddenStates(_message.Message):
     __slots__ = ("values", "layer", "position")
...
sgl-router/src/proto/sglang_scheduler.proto  (view file @ 5937a56d)
(Same change as python/sglang/srt/grpc/sglang_scheduler.proto above — the router keeps a mirror copy of the schema.)

@@ -174,11 +174,14 @@ message GenerateStreamChunk {
   int32 completion_tokens = 3;
   int32 cached_tokens = 4;

-  // Logprobs (if requested)
-  LogProbs logprobs = 5;
+  // Output logprobs (if requested) - incremental for streaming
+  LogProbs output_logprobs = 5;

   // Hidden states (if requested)
   repeated float hidden_states = 6;
+
+  // Input logprobs (if requested) - only in first chunk
+  LogProbs input_logprobs = 7;
 }

 message GenerateComplete {
...
@@ -193,8 +196,8 @@ message GenerateComplete {
   int32 completion_tokens = 4;
   int32 cached_tokens = 5;

-  // All logprobs if requested
-  repeated LogProbs all_logprobs = 6;
+  // Output logprobs if requested (cumulative)
+  LogProbs output_logprobs = 6;

   // All hidden states if requested
   repeated HiddenStates all_hidden_states = 7;
...
@@ -204,6 +207,9 @@ message GenerateComplete {
     uint32 matched_token_id = 8;
     string matched_stop_str = 9;
   }
+
+  // Input logprobs if requested (for prompt tokens)
+  LogProbs input_logprobs = 10;
 }

 message GenerateError {
...
@@ -218,15 +224,11 @@ message LogProbs {
   // Top logprobs at each position
   repeated TopLogProbs top_logprobs = 3;
-
-  // Decoded text for tokens
-  repeated string token_texts = 4;
 }

 message TopLogProbs {
   repeated float values = 1;
   repeated int32 token_ids = 2;
-  repeated string token_texts = 3;
 }

 message HiddenStates {
...
sgl-router/src/routers/grpc/router.rs  (view file @ 5937a56d)
@@ -730,6 +730,73 @@ impl GrpcRouter {
         Json(response).into_response()
     }

+    /// Convert proto LogProbs to OpenAI ChatLogProbs format
+    /// Note: Always decodes with skip_special_tokens=false to show actual tokens generated
+    fn convert_proto_to_openai_logprobs(
+        &self,
+        proto_logprobs: &proto::LogProbs,
+    ) -> Result<crate::protocols::spec::ChatLogProbs, String> {
+        let mut content_items = Vec::new();
+
+        // Decode token IDs to text (always with skip_special_tokens=false for logprobs)
+        let token_texts: Vec<String> = proto_logprobs
+            .token_ids
+            .iter()
+            .map(|&token_id| {
+                self.tokenizer
+                    .decode(&[token_id as u32], false)
+                    .unwrap_or_else(|_| format!("<token_{}>", token_id))
+            })
+            .collect();
+
+        // Build ChatLogProbsContent for each token
+        for (i, &logprob) in proto_logprobs.token_logprobs.iter().enumerate() {
+            let token_text = token_texts.get(i).cloned().unwrap_or_default();
+            let bytes = Some(token_text.as_bytes().to_vec());
+
+            // Build top_logprobs for this position
+            let mut top_logprobs = Vec::new();
+            if let Some(top_logprobs_entry) = proto_logprobs.top_logprobs.get(i) {
+                // Decode top token IDs (always with skip_special_tokens=false)
+                let top_token_texts: Vec<String> = top_logprobs_entry
+                    .token_ids
+                    .iter()
+                    .map(|&tid| {
+                        self.tokenizer
+                            .decode(&[tid as u32], false)
+                            .unwrap_or_else(|_| format!("<token_{}>", tid))
+                    })
+                    .collect();
+
+                for (j, (&top_logprob, &_top_token_id)) in top_logprobs_entry
+                    .values
+                    .iter()
+                    .zip(top_logprobs_entry.token_ids.iter())
+                    .enumerate()
+                {
+                    if let Some(top_token_text) = top_token_texts.get(j) {
+                        top_logprobs.push(crate::protocols::spec::TopLogProb {
+                            token: top_token_text.clone(),
+                            logprob: top_logprob,
+                            bytes: Some(top_token_text.as_bytes().to_vec()),
+                        });
+                    }
+                }
+            }
+
+            content_items.push(crate::protocols::spec::ChatLogProbsContent {
+                token: token_text,
+                logprob,
+                bytes,
+                top_logprobs,
+            });
+        }
+
+        Ok(crate::protocols::spec::ChatLogProbs::Detailed {
+            content: (!content_items.is_empty()).then_some(content_items),
+        })
+    }
+
     /// Process a single GenerateComplete response into a ChatChoice
     async fn process_single_choice(
         &self,
...
@@ -855,7 +922,22 @@ impl GrpcRouter {
             None => None,
         };

-        // Step 4: Build ChatCompletionMessage (proper response message type)
+        // Step 4: Convert output logprobs if present
+        // Note: complete.input_logprobs exists in proto but is not used for chat completions
+        // (input logprobs are only used in /v1/completions endpoint with echo=true)
+        let logprobs = if let Some(proto_logprobs) = &complete.output_logprobs {
+            match self.convert_proto_to_openai_logprobs(proto_logprobs) {
+                Ok(logprobs) => Some(logprobs),
+                Err(e) => {
+                    error!("Failed to convert logprobs: {}", e);
+                    None
+                }
+            }
+        } else {
+            None
+        };
+
+        // Step 5: Build ChatCompletionMessage (proper response message type)
         let chat_message = ChatCompletionMessage {
             role: "assistant".to_string(),
             content: if processed_text.is_empty() {
...
@@ -867,11 +949,11 @@ impl GrpcRouter {
             reasoning_content: reasoning_text,
         };

-        // Step 5: Build ChatChoice
+        // Step 6: Build ChatChoice
         let choice = ChatChoice {
             index: index as u32,
             message: chat_message,
-            logprobs: None,
+            logprobs,
             finish_reason: Some(final_finish_reason_str.to_string()),
             matched_stop,
             hidden_states: None,
...
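
On the OpenAI-compatible surface, the `ChatLogProbs::Detailed` value built above serializes into the standard `choices[].logprobs.content[]` structure. Roughly, a chat choice now carries a shape like the following (illustrative values; the `bytes` arrays are the UTF-8 bytes of each decoded token, matching the `as_bytes().to_vec()` calls in the converter):

    # Illustrative final JSON shape after the router's conversion (values made up).
    choice_logprobs = {
        "content": [
            {
                "token": "Hello",
                "logprob": -0.11,
                "bytes": [72, 101, 108, 108, 111],
                "top_logprobs": [
                    {"token": "Hello", "logprob": -0.11, "bytes": [72, 101, 108, 108, 111]},
                    {"token": "Hi", "logprob": -2.40, "bytes": [72, 105]},
                ],
            },
        ]
    }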