Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
7ff740a6
Unverified
Commit
7ff740a6
authored
Oct 03, 2025
by
Liangsheng Yin
Committed by
GitHub
Oct 03, 2025
Browse files
Remove dp balance metadata and minimul token balance. (#11170)
parent
bfcd9b24
Changes
11
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
77 additions
and
316 deletions
+77
-316
python/sglang/srt/entrypoints/engine.py
python/sglang/srt/entrypoints/engine.py
+0
-1
python/sglang/srt/grpc/sglang_scheduler.proto
python/sglang/srt/grpc/sglang_scheduler.proto
+1
-4
python/sglang/srt/grpc/sglang_scheduler_pb2.py
python/sglang/srt/grpc/sglang_scheduler_pb2.py
+64
-64
python/sglang/srt/grpc/sglang_scheduler_pb2.pyi
python/sglang/srt/grpc/sglang_scheduler_pb2.pyi
+2
-4
python/sglang/srt/managers/data_parallel_controller.py
python/sglang/srt/managers/data_parallel_controller.py
+8
-45
python/sglang/srt/managers/io_struct.py
python/sglang/srt/managers/io_struct.py
+0
-5
python/sglang/srt/managers/scheduler.py
python/sglang/srt/managers/scheduler.py
+1
-8
python/sglang/srt/managers/scheduler_metrics_mixin.py
python/sglang/srt/managers/scheduler_metrics_mixin.py
+0
-96
python/sglang/srt/managers/utils.py
python/sglang/srt/managers/utils.py
+0
-43
sgl-router/src/proto/sglang_scheduler.proto
sgl-router/src/proto/sglang_scheduler.proto
+1
-4
test/srt/test_dp_attention.py
test/srt/test_dp_attention.py
+0
-42
No files found.
python/sglang/srt/entrypoints/engine.py
View file @
7ff740a6
...
@@ -812,7 +812,6 @@ def _launch_subprocesses(
...
@@ -812,7 +812,6 @@ def _launch_subprocesses(
pp_rank
,
pp_rank
,
None
,
None
,
writer
,
writer
,
None
,
),
),
)
)
...
...
python/sglang/srt/grpc/sglang_scheduler.proto
View file @
7ff740a6
...
@@ -120,11 +120,8 @@ message GenerateRequest {
...
@@ -120,11 +120,8 @@ message GenerateRequest {
// Data parallel routing
// Data parallel routing
int32
data_parallel_rank
=
16
;
int32
data_parallel_rank
=
16
;
// For load balancing
int32
dp_balance_id
=
17
;
// Whether client wants streaming response
// Whether client wants streaming response
bool
stream
=
1
8
;
bool
stream
=
1
7
;
}
}
message
TokenizedInput
{
message
TokenizedInput
{
...
...
python/sglang/srt/grpc/sglang_scheduler_pb2.py
View file @
7ff740a6
...
@@ -29,7 +29,7 @@ from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__
...
@@ -29,7 +29,7 @@ from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__
from
google.protobuf
import
struct_pb2
as
google_dot_protobuf_dot_struct__pb2
from
google.protobuf
import
struct_pb2
as
google_dot_protobuf_dot_struct__pb2
DESCRIPTOR
=
_descriptor_pool
.
Default
().
AddSerializedFile
(
b
'
\n\x16
sglang_scheduler.proto
\x12\x15
sglang.grpc.scheduler
\x1a\x1f
google/protobuf/timestamp.proto
\x1a\x1c
google/protobuf/struct.proto
\"\xe1\x05\n\x0e
SamplingParams
\x12\x13\n\x0b
temperature
\x18\x01
\x01
(
\x02\x12\r\n\x05
top_p
\x18\x02
\x01
(
\x02\x12\r\n\x05
top_k
\x18\x03
\x01
(
\x05\x12\r\n\x05
min_p
\x18\x04
\x01
(
\x02\x12\x19\n\x11\x66
requency_penalty
\x18\x05
\x01
(
\x02\x12\x18\n\x10
presence_penalty
\x18\x06
\x01
(
\x02\x12\x1a\n\x12
repetition_penalty
\x18\x07
\x01
(
\x02\x12\x1b\n\x0e
max_new_tokens
\x18\x08
\x01
(
\x05
H
\x01\x88\x01\x01\x12\x0c\n\x04
stop
\x18\t
\x03
(
\t\x12\x16\n\x0e
stop_token_ids
\x18\n
\x03
(
\r\x12\x1b\n\x13
skip_special_tokens
\x18\x0b
\x01
(
\x08\x12
%
\n\x1d
spaces_between_special_tokens
\x18\x0c
\x01
(
\x08\x12\x0f\n\x05
regex
\x18\r
\x01
(
\t
H
\x00\x12\x15\n\x0b
json_schema
\x18\x0e
\x01
(
\t
H
\x00\x12\x16\n\x0c\x65\x62
nf_grammar
\x18\x0f
\x01
(
\t
H
\x00\x12\x18\n\x0e
structural_tag
\x18\x10
\x01
(
\t
H
\x00\x12\x11\n\t
lora_path
\x18\x11
\x01
(
\t\x12\t\n\x01
n
\x18\x12
\x01
(
\x05\x12\x15\n\r
token_healing
\x18\x13
\x01
(
\x08\x12\x16\n\x0e
min_new_tokens
\x18\x14
\x01
(
\x05\x12\x12\n\n
ignore_eos
\x18\x15
\x01
(
\x08\x12\x14\n\x0c
no_stop_trim
\x18\x16
\x01
(
\x08\x12\x17\n\x0f
stream_interval
\x18\x17
\x01
(
\x05\x12
H
\n\n
logit_bias
\x18\x18
\x03
(
\x0b\x32\x34
.sglang.grpc.scheduler.SamplingParams.LogitBiasEntry
\x12
.
\n\r
custom_params
\x18\x19
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x1a\x30\n\x0e
LogitBiasEntry
\x12\x0b\n\x03
key
\x18\x01
\x01
(
\t\x12\r\n\x05
value
\x18\x02
\x01
(
\x02
:
\x02\x38\x01\x42\x0c\n\n
constraintB
\x11\n\x0f
_max_new_tokens
\"
]
\n\x13\x44
isaggregatedParams
\x12\x16\n\x0e\x62
ootstrap_host
\x18\x01
\x01
(
\t\x12\x16\n\x0e\x62
ootstrap_port
\x18\x02
\x01
(
\x05\x12\x16\n\x0e\x62
ootstrap_room
\x18\x03
\x01
(
\x05\"\x
f9
\x04\n\x0f
GenerateRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\t
tokenized
\x18\x02
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\x12
:
\n\t
mm_inputs
\x18\x03
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.MultimodalInputs
\x12
>
\n\x0f
sampling_params
\x18\x04
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.SamplingParams
\x12\x16\n\x0e
return_logprob
\x18\x05
\x01
(
\x08\x12\x19\n\x11
logprob_start_len
\x18\x06
\x01
(
\x05\x12\x18\n\x10
top_logprobs_num
\x18\x07
\x01
(
\x05\x12\x19\n\x11
token_ids_logprob
\x18\x08
\x03
(
\r\x12\x1c\n\x14
return_hidden_states
\x18\t
\x01
(
\x08\x12
H
\n\x14\x64
isaggregated_params
\x18\n
\x01
(
\x0b\x32
*.sglang.grpc.scheduler.DisaggregatedParams
\x12\x1e\n\x16\x63
ustom_logit_processor
\x18\x0b
\x01
(
\t\x12
-
\n\t
timestamp
\x18\x0c
\x01
(
\x0b\x32\x1a
.google.protobuf.Timestamp
\x12\x13\n\x0b
log_metrics
\x18\r
\x01
(
\x08\x12\x14\n\x0c
input_embeds
\x18\x0e
\x03
(
\x02\x12\x0f\n\x07
lora_id
\x18\x0f
\x01
(
\t\x12\x1a\n\x12\x64\x61
ta_parallel_rank
\x18\x10
\x01
(
\x05\x12\x15\n\r
dp_balance_id
\x18\x11
\x01
(
\x05\x12\x0e\n\x06
stream
\x18\x1
2
\x01
(
\x08\"
:
\n\x0e
TokenizedInput
\x12\x15\n\r
original_text
\x18\x01
\x01
(
\t\x12\x11\n\t
input_ids
\x18\x02
\x03
(
\r\"\xd3\x01\n\x10
MultimodalInputs
\x12\x12\n\n
image_urls
\x18\x01
\x03
(
\t\x12\x12\n\n
video_urls
\x18\x02
\x03
(
\t\x12\x12\n\n
audio_urls
\x18\x03
\x03
(
\t\x12\x33\n\x12
processed_features
\x18\x04
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x12\x12\n\n
image_data
\x18\x05
\x03
(
\x0c\x12\x12\n\n
video_data
\x18\x06
\x03
(
\x0c\x12\x12\n\n
audio_data
\x18\x07
\x03
(
\x0c\x12\x12\n\n
modalities
\x18\x08
\x03
(
\t\"\xe3\x01\n\x10
GenerateResponse
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12
;
\n\x05\x63
hunk
\x18\x02
\x01
(
\x0b\x32
*.sglang.grpc.scheduler.GenerateStreamChunkH
\x00\x12
;
\n\x08\x63
omplete
\x18\x03
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.GenerateCompleteH
\x00\x12\x35\n\x05\x65
rror
\x18\x04
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.GenerateErrorH
\x00\x42\n\n\x08
response
\"\x86\x02\n\x13
GenerateStreamChunk
\x12\x11\n\t
token_ids
\x18\x01
\x03
(
\r\x12\x15\n\r
prompt_tokens
\x18\x02
\x01
(
\x05\x12\x19\n\x11\x63
ompletion_tokens
\x18\x03
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x04
\x01
(
\x05\x12
>
\n\x0f
output_logprobs
\x18\x05
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.OutputLogProbs
\x12\x15\n\r
hidden_states
\x18\x06
\x03
(
\x02\x12
<
\n\x0e
input_logprobs
\x18\x07
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.InputLogProbs
\"\x8c\x03\n\x10
GenerateComplete
\x12\x12\n\n
output_ids
\x18\x01
\x03
(
\r\x12\x15\n\r
finish_reason
\x18\x02
\x01
(
\t\x12\x15\n\r
prompt_tokens
\x18\x03
\x01
(
\x05\x12\x19\n\x11\x63
ompletion_tokens
\x18\x04
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x05
\x01
(
\x05\x12
>
\n\x0f
output_logprobs
\x18\x06
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.OutputLogProbs
\x12
>
\n\x11\x61
ll_hidden_states
\x18\x07
\x03
(
\x0b\x32
#.sglang.grpc.scheduler.HiddenStates
\x12\x1a\n\x10
matched_token_id
\x18\x08
\x01
(
\r
H
\x00\x12\x1a\n\x10
matched_stop_str
\x18\t
\x01
(
\t
H
\x00\x12
<
\n\x0e
input_logprobs
\x18\n
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.InputLogProbsB
\x0e\n\x0c
matched_stop
\"
K
\n\r
GenerateError
\x12\x0f\n\x07
message
\x18\x01
\x01
(
\t\x12\x18\n\x10
http_status_code
\x18\x02
\x01
(
\t\x12\x0f\n\x07\x64\x65
tails
\x18\x03
\x01
(
\t\"
u
\n\x0e
OutputLogProbs
\x12\x16\n\x0e
token_logprobs
\x18\x01
\x03
(
\x02\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\x12\x38\n\x0c
top_logprobs
\x18\x03
\x03
(
\x0b\x32\"
.sglang.grpc.scheduler.TopLogProbs
\"\x9e\x01\n\r
InputLogProbs
\x12
@
\n\x0e
token_logprobs
\x18\x01
\x03
(
\x0b\x32
(.sglang.grpc.scheduler.InputTokenLogProb
\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\x12\x38\n\x0c
top_logprobs
\x18\x03
\x03
(
\x0b\x32\"
.sglang.grpc.scheduler.TopLogProbs
\"
1
\n\x11
InputTokenLogProb
\x12\x12\n\x05
value
\x18\x01
\x01
(
\x02
H
\x00\x88\x01\x01\x42\x08\n\x06
_value
\"
0
\n\x0b
TopLogProbs
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\"
?
\n\x0c
HiddenStates
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\r\n\x05
layer
\x18\x02
\x01
(
\x05\x12\x10\n\x08
position
\x18\x03
\x01
(
\x05\"\xca\x02\n\x0c\x45
mbedRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\t
tokenized
\x18\x02
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\x12
:
\n\t
mm_inputs
\x18\x04
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.MultimodalInputs
\x12
>
\n\x0f
sampling_params
\x18\x05
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.SamplingParams
\x12\x13\n\x0b
log_metrics
\x18\x06
\x01
(
\x08\x12\x16\n\x0e
token_type_ids
\x18\x07
\x03
(
\x05\x12\x1a\n\x12\x64\x61
ta_parallel_rank
\x18\x08
\x01
(
\x05\x12\x18\n\x10
is_cross_encoder
\x18\t
\x01
(
\x08\x12\r\n\x05
texts
\x18\n
\x03
(
\t\"\x9d\x01\n\r
EmbedResponse
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\x08\x63
omplete
\x18\x02
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.EmbedCompleteH
\x00\x12\x32\n\x05\x65
rror
\x18\x03
\x01
(
\x0b\x32
!.sglang.grpc.scheduler.EmbedErrorH
\x00\x42\n\n\x08
response
\"\xa3\x01\n\r
EmbedComplete
\x12\x11\n\t
embedding
\x18\x01
\x03
(
\x02\x12\x15\n\r
prompt_tokens
\x18\x02
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x03
\x01
(
\x05\x12\x15\n\r
embedding_dim
\x18\x04
\x01
(
\x05\x12
:
\n\x10\x62\x61
tch_embeddings
\x18\x05
\x03
(
\x0b\x32
.sglang.grpc.scheduler.Embedding
\"
*
\n\t
Embedding
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\r\n\x05
index
\x18\x02
\x01
(
\x05\"
<
\n\n
EmbedError
\x12\x0f\n\x07
message
\x18\x01
\x01
(
\t\x12\x0c\n\x04\x63
ode
\x18\x02
\x01
(
\t\x12\x0f\n\x07\x64\x65
tails
\x18\x03
\x01
(
\t\"
N
\n\x12
HealthCheckRequest
\x12\x38\n\t
tokenized
\x18\x01
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\"
7
\n\x13
HealthCheckResponse
\x12\x0f\n\x07
healthy
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
2
\n\x0c\x41\x62
ortRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x0e\n\x06
reason
\x18\x02
\x01
(
\t\"
1
\n\r
AbortResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
I
\n\x0f
LoadLoRARequest
\x12\x12\n\n
adapter_id
\x18\x01
\x01
(
\t\x12\x14\n\x0c\x61\x64\x61
pter_path
\x18\x02
\x01
(
\t\x12\x0c\n\x04
rank
\x18\x03
\x01
(
\x05\"
H
\n\x10
LoadLoRAResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x12\n\n
adapter_id
\x18\x02
\x01
(
\t\x12\x0f\n\x07
message
\x18\x03
\x01
(
\t\"\'\n\x11
UnloadLoRARequest
\x12\x12\n\n
adapter_id
\x18\x01
\x01
(
\t\"
6
\n\x12
UnloadLoRAResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
w
\n\x14
UpdateWeightsRequest
\x12\x13\n\t
disk_path
\x18\x01
\x01
(
\t
H
\x00\x12\x15\n\x0b
tensor_data
\x18\x02
\x01
(
\x0c
H
\x00\x12\x14\n\n
remote_url
\x18\x03
\x01
(
\t
H
\x00\x12\x13\n\x0b
weight_name
\x18\x04
\x01
(
\t
B
\x08\n\x06
source
\"
9
\n\x15
UpdateWeightsResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
-
\n\x17
GetInternalStateRequest
\x12\x12\n\n
state_keys
\x18\x01
\x03
(
\t\"
B
\n\x18
GetInternalStateResponse
\x12
&
\n\x05
state
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\"
A
\n\x17
SetInternalStateRequest
\x12
&
\n\x05
state
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\"
<
\n\x18
SetInternalStateResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t
2
\xfe\x02\n\x0f
SglangScheduler
\x12
]
\n\x08
Generate
\x12
&.sglang.grpc.scheduler.GenerateRequest
\x1a\'
.sglang.grpc.scheduler.GenerateResponse0
\x01\x12
R
\n\x05\x45
mbed
\x12
#.sglang.grpc.scheduler.EmbedRequest
\x1a
$.sglang.grpc.scheduler.EmbedResponse
\x12\x64\n\x0b
HealthCheck
\x12
).sglang.grpc.scheduler.HealthCheckRequest
\x1a
*.sglang.grpc.scheduler.HealthCheckResponse
\x12
R
\n\x05\x41\x62
ort
\x12
#.sglang.grpc.scheduler.AbortRequest
\x1a
$.sglang.grpc.scheduler.AbortResponseb
\x06
proto3'
)
DESCRIPTOR
=
_descriptor_pool
.
Default
().
AddSerializedFile
(
b
'
\n\x16
sglang_scheduler.proto
\x12\x15
sglang.grpc.scheduler
\x1a\x1f
google/protobuf/timestamp.proto
\x1a\x1c
google/protobuf/struct.proto
\"\xe1\x05\n\x0e
SamplingParams
\x12\x13\n\x0b
temperature
\x18\x01
\x01
(
\x02\x12\r\n\x05
top_p
\x18\x02
\x01
(
\x02\x12\r\n\x05
top_k
\x18\x03
\x01
(
\x05\x12\r\n\x05
min_p
\x18\x04
\x01
(
\x02\x12\x19\n\x11\x66
requency_penalty
\x18\x05
\x01
(
\x02\x12\x18\n\x10
presence_penalty
\x18\x06
\x01
(
\x02\x12\x1a\n\x12
repetition_penalty
\x18\x07
\x01
(
\x02\x12\x1b\n\x0e
max_new_tokens
\x18\x08
\x01
(
\x05
H
\x01\x88\x01\x01\x12\x0c\n\x04
stop
\x18\t
\x03
(
\t\x12\x16\n\x0e
stop_token_ids
\x18\n
\x03
(
\r\x12\x1b\n\x13
skip_special_tokens
\x18\x0b
\x01
(
\x08\x12
%
\n\x1d
spaces_between_special_tokens
\x18\x0c
\x01
(
\x08\x12\x0f\n\x05
regex
\x18\r
\x01
(
\t
H
\x00\x12\x15\n\x0b
json_schema
\x18\x0e
\x01
(
\t
H
\x00\x12\x16\n\x0c\x65\x62
nf_grammar
\x18\x0f
\x01
(
\t
H
\x00\x12\x18\n\x0e
structural_tag
\x18\x10
\x01
(
\t
H
\x00\x12\x11\n\t
lora_path
\x18\x11
\x01
(
\t\x12\t\n\x01
n
\x18\x12
\x01
(
\x05\x12\x15\n\r
token_healing
\x18\x13
\x01
(
\x08\x12\x16\n\x0e
min_new_tokens
\x18\x14
\x01
(
\x05\x12\x12\n\n
ignore_eos
\x18\x15
\x01
(
\x08\x12\x14\n\x0c
no_stop_trim
\x18\x16
\x01
(
\x08\x12\x17\n\x0f
stream_interval
\x18\x17
\x01
(
\x05\x12
H
\n\n
logit_bias
\x18\x18
\x03
(
\x0b\x32\x34
.sglang.grpc.scheduler.SamplingParams.LogitBiasEntry
\x12
.
\n\r
custom_params
\x18\x19
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x1a\x30\n\x0e
LogitBiasEntry
\x12\x0b\n\x03
key
\x18\x01
\x01
(
\t\x12\r\n\x05
value
\x18\x02
\x01
(
\x02
:
\x02\x38\x01\x42\x0c\n\n
constraintB
\x11\n\x0f
_max_new_tokens
\"
]
\n\x13\x44
isaggregatedParams
\x12\x16\n\x0e\x62
ootstrap_host
\x18\x01
\x01
(
\t\x12\x16\n\x0e\x62
ootstrap_port
\x18\x02
\x01
(
\x05\x12\x16\n\x0e\x62
ootstrap_room
\x18\x03
\x01
(
\x05\"\x
e2
\x04\n\x0f
GenerateRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\t
tokenized
\x18\x02
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\x12
:
\n\t
mm_inputs
\x18\x03
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.MultimodalInputs
\x12
>
\n\x0f
sampling_params
\x18\x04
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.SamplingParams
\x12\x16\n\x0e
return_logprob
\x18\x05
\x01
(
\x08\x12\x19\n\x11
logprob_start_len
\x18\x06
\x01
(
\x05\x12\x18\n\x10
top_logprobs_num
\x18\x07
\x01
(
\x05\x12\x19\n\x11
token_ids_logprob
\x18\x08
\x03
(
\r\x12\x1c\n\x14
return_hidden_states
\x18\t
\x01
(
\x08\x12
H
\n\x14\x64
isaggregated_params
\x18\n
\x01
(
\x0b\x32
*.sglang.grpc.scheduler.DisaggregatedParams
\x12\x1e\n\x16\x63
ustom_logit_processor
\x18\x0b
\x01
(
\t\x12
-
\n\t
timestamp
\x18\x0c
\x01
(
\x0b\x32\x1a
.google.protobuf.Timestamp
\x12\x13\n\x0b
log_metrics
\x18\r
\x01
(
\x08\x12\x14\n\x0c
input_embeds
\x18\x0e
\x03
(
\x02\x12\x0f\n\x07
lora_id
\x18\x0f
\x01
(
\t\x12\x1a\n\x12\x64\x61
ta_parallel_rank
\x18\x10
\x01
(
\x05\x12\x0e\n\x06
stream
\x18\x1
1
\x01
(
\x08\"
:
\n\x0e
TokenizedInput
\x12\x15\n\r
original_text
\x18\x01
\x01
(
\t\x12\x11\n\t
input_ids
\x18\x02
\x03
(
\r\"\xd3\x01\n\x10
MultimodalInputs
\x12\x12\n\n
image_urls
\x18\x01
\x03
(
\t\x12\x12\n\n
video_urls
\x18\x02
\x03
(
\t\x12\x12\n\n
audio_urls
\x18\x03
\x03
(
\t\x12\x33\n\x12
processed_features
\x18\x04
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x12\x12\n\n
image_data
\x18\x05
\x03
(
\x0c\x12\x12\n\n
video_data
\x18\x06
\x03
(
\x0c\x12\x12\n\n
audio_data
\x18\x07
\x03
(
\x0c\x12\x12\n\n
modalities
\x18\x08
\x03
(
\t\"\xe3\x01\n\x10
GenerateResponse
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12
;
\n\x05\x63
hunk
\x18\x02
\x01
(
\x0b\x32
*.sglang.grpc.scheduler.GenerateStreamChunkH
\x00\x12
;
\n\x08\x63
omplete
\x18\x03
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.GenerateCompleteH
\x00\x12\x35\n\x05\x65
rror
\x18\x04
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.GenerateErrorH
\x00\x42\n\n\x08
response
\"\x86\x02\n\x13
GenerateStreamChunk
\x12\x11\n\t
token_ids
\x18\x01
\x03
(
\r\x12\x15\n\r
prompt_tokens
\x18\x02
\x01
(
\x05\x12\x19\n\x11\x63
ompletion_tokens
\x18\x03
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x04
\x01
(
\x05\x12
>
\n\x0f
output_logprobs
\x18\x05
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.OutputLogProbs
\x12\x15\n\r
hidden_states
\x18\x06
\x03
(
\x02\x12
<
\n\x0e
input_logprobs
\x18\x07
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.InputLogProbs
\"\x8c\x03\n\x10
GenerateComplete
\x12\x12\n\n
output_ids
\x18\x01
\x03
(
\r\x12\x15\n\r
finish_reason
\x18\x02
\x01
(
\t\x12\x15\n\r
prompt_tokens
\x18\x03
\x01
(
\x05\x12\x19\n\x11\x63
ompletion_tokens
\x18\x04
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x05
\x01
(
\x05\x12
>
\n\x0f
output_logprobs
\x18\x06
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.OutputLogProbs
\x12
>
\n\x11\x61
ll_hidden_states
\x18\x07
\x03
(
\x0b\x32
#.sglang.grpc.scheduler.HiddenStates
\x12\x1a\n\x10
matched_token_id
\x18\x08
\x01
(
\r
H
\x00\x12\x1a\n\x10
matched_stop_str
\x18\t
\x01
(
\t
H
\x00\x12
<
\n\x0e
input_logprobs
\x18\n
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.InputLogProbsB
\x0e\n\x0c
matched_stop
\"
K
\n\r
GenerateError
\x12\x0f\n\x07
message
\x18\x01
\x01
(
\t\x12\x18\n\x10
http_status_code
\x18\x02
\x01
(
\t\x12\x0f\n\x07\x64\x65
tails
\x18\x03
\x01
(
\t\"
u
\n\x0e
OutputLogProbs
\x12\x16\n\x0e
token_logprobs
\x18\x01
\x03
(
\x02\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\x12\x38\n\x0c
top_logprobs
\x18\x03
\x03
(
\x0b\x32\"
.sglang.grpc.scheduler.TopLogProbs
\"\x9e\x01\n\r
InputLogProbs
\x12
@
\n\x0e
token_logprobs
\x18\x01
\x03
(
\x0b\x32
(.sglang.grpc.scheduler.InputTokenLogProb
\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\x12\x38\n\x0c
top_logprobs
\x18\x03
\x03
(
\x0b\x32\"
.sglang.grpc.scheduler.TopLogProbs
\"
1
\n\x11
InputTokenLogProb
\x12\x12\n\x05
value
\x18\x01
\x01
(
\x02
H
\x00\x88\x01\x01\x42\x08\n\x06
_value
\"
0
\n\x0b
TopLogProbs
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\"
?
\n\x0c
HiddenStates
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\r\n\x05
layer
\x18\x02
\x01
(
\x05\x12\x10\n\x08
position
\x18\x03
\x01
(
\x05\"\xca\x02\n\x0c\x45
mbedRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\t
tokenized
\x18\x02
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\x12
:
\n\t
mm_inputs
\x18\x04
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.MultimodalInputs
\x12
>
\n\x0f
sampling_params
\x18\x05
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.SamplingParams
\x12\x13\n\x0b
log_metrics
\x18\x06
\x01
(
\x08\x12\x16\n\x0e
token_type_ids
\x18\x07
\x03
(
\x05\x12\x1a\n\x12\x64\x61
ta_parallel_rank
\x18\x08
\x01
(
\x05\x12\x18\n\x10
is_cross_encoder
\x18\t
\x01
(
\x08\x12\r\n\x05
texts
\x18\n
\x03
(
\t\"\x9d\x01\n\r
EmbedResponse
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\x08\x63
omplete
\x18\x02
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.EmbedCompleteH
\x00\x12\x32\n\x05\x65
rror
\x18\x03
\x01
(
\x0b\x32
!.sglang.grpc.scheduler.EmbedErrorH
\x00\x42\n\n\x08
response
\"\xa3\x01\n\r
EmbedComplete
\x12\x11\n\t
embedding
\x18\x01
\x03
(
\x02\x12\x15\n\r
prompt_tokens
\x18\x02
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x03
\x01
(
\x05\x12\x15\n\r
embedding_dim
\x18\x04
\x01
(
\x05\x12
:
\n\x10\x62\x61
tch_embeddings
\x18\x05
\x03
(
\x0b\x32
.sglang.grpc.scheduler.Embedding
\"
*
\n\t
Embedding
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\r\n\x05
index
\x18\x02
\x01
(
\x05\"
<
\n\n
EmbedError
\x12\x0f\n\x07
message
\x18\x01
\x01
(
\t\x12\x0c\n\x04\x63
ode
\x18\x02
\x01
(
\t\x12\x0f\n\x07\x64\x65
tails
\x18\x03
\x01
(
\t\"
N
\n\x12
HealthCheckRequest
\x12\x38\n\t
tokenized
\x18\x01
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\"
7
\n\x13
HealthCheckResponse
\x12\x0f\n\x07
healthy
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
2
\n\x0c\x41\x62
ortRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x0e\n\x06
reason
\x18\x02
\x01
(
\t\"
1
\n\r
AbortResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
I
\n\x0f
LoadLoRARequest
\x12\x12\n\n
adapter_id
\x18\x01
\x01
(
\t\x12\x14\n\x0c\x61\x64\x61
pter_path
\x18\x02
\x01
(
\t\x12\x0c\n\x04
rank
\x18\x03
\x01
(
\x05\"
H
\n\x10
LoadLoRAResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x12\n\n
adapter_id
\x18\x02
\x01
(
\t\x12\x0f\n\x07
message
\x18\x03
\x01
(
\t\"\'\n\x11
UnloadLoRARequest
\x12\x12\n\n
adapter_id
\x18\x01
\x01
(
\t\"
6
\n\x12
UnloadLoRAResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
w
\n\x14
UpdateWeightsRequest
\x12\x13\n\t
disk_path
\x18\x01
\x01
(
\t
H
\x00\x12\x15\n\x0b
tensor_data
\x18\x02
\x01
(
\x0c
H
\x00\x12\x14\n\n
remote_url
\x18\x03
\x01
(
\t
H
\x00\x12\x13\n\x0b
weight_name
\x18\x04
\x01
(
\t
B
\x08\n\x06
source
\"
9
\n\x15
UpdateWeightsResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
-
\n\x17
GetInternalStateRequest
\x12\x12\n\n
state_keys
\x18\x01
\x03
(
\t\"
B
\n\x18
GetInternalStateResponse
\x12
&
\n\x05
state
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\"
A
\n\x17
SetInternalStateRequest
\x12
&
\n\x05
state
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\"
<
\n\x18
SetInternalStateResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t
2
\xfe\x02\n\x0f
SglangScheduler
\x12
]
\n\x08
Generate
\x12
&.sglang.grpc.scheduler.GenerateRequest
\x1a\'
.sglang.grpc.scheduler.GenerateResponse0
\x01\x12
R
\n\x05\x45
mbed
\x12
#.sglang.grpc.scheduler.EmbedRequest
\x1a
$.sglang.grpc.scheduler.EmbedResponse
\x12\x64\n\x0b
HealthCheck
\x12
).sglang.grpc.scheduler.HealthCheckRequest
\x1a
*.sglang.grpc.scheduler.HealthCheckResponse
\x12
R
\n\x05\x41\x62
ort
\x12
#.sglang.grpc.scheduler.AbortRequest
\x1a
$.sglang.grpc.scheduler.AbortResponseb
\x06
proto3'
)
_globals
=
globals
()
_globals
=
globals
()
_builder
.
BuildMessageAndEnumDescriptors
(
DESCRIPTOR
,
_globals
)
_builder
.
BuildMessageAndEnumDescriptors
(
DESCRIPTOR
,
_globals
)
...
@@ -45,67 +45,67 @@ if not _descriptor._USE_C_DESCRIPTORS:
...
@@ -45,67 +45,67 @@ if not _descriptor._USE_C_DESCRIPTORS:
_globals
[
'_DISAGGREGATEDPARAMS'
].
_serialized_start
=
852
_globals
[
'_DISAGGREGATEDPARAMS'
].
_serialized_start
=
852
_globals
[
'_DISAGGREGATEDPARAMS'
].
_serialized_end
=
945
_globals
[
'_DISAGGREGATEDPARAMS'
].
_serialized_end
=
945
_globals
[
'_GENERATEREQUEST'
].
_serialized_start
=
948
_globals
[
'_GENERATEREQUEST'
].
_serialized_start
=
948
_globals
[
'_GENERATEREQUEST'
].
_serialized_end
=
158
1
_globals
[
'_GENERATEREQUEST'
].
_serialized_end
=
15
5
8
_globals
[
'_TOKENIZEDINPUT'
].
_serialized_start
=
15
83
_globals
[
'_TOKENIZEDINPUT'
].
_serialized_start
=
15
60
_globals
[
'_TOKENIZEDINPUT'
].
_serialized_end
=
16
4
1
_globals
[
'_TOKENIZEDINPUT'
].
_serialized_end
=
161
8
_globals
[
'_MULTIMODALINPUTS'
].
_serialized_start
=
16
44
_globals
[
'_MULTIMODALINPUTS'
].
_serialized_start
=
16
21
_globals
[
'_MULTIMODALINPUTS'
].
_serialized_end
=
18
55
_globals
[
'_MULTIMODALINPUTS'
].
_serialized_end
=
18
32
_globals
[
'_GENERATERESPONSE'
].
_serialized_start
=
185
8
_globals
[
'_GENERATERESPONSE'
].
_serialized_start
=
18
3
5
_globals
[
'_GENERATERESPONSE'
].
_serialized_end
=
20
85
_globals
[
'_GENERATERESPONSE'
].
_serialized_end
=
20
62
_globals
[
'_GENERATESTREAMCHUNK'
].
_serialized_start
=
20
88
_globals
[
'_GENERATESTREAMCHUNK'
].
_serialized_start
=
20
65
_globals
[
'_GENERATESTREAMCHUNK'
].
_serialized_end
=
23
50
_globals
[
'_GENERATESTREAMCHUNK'
].
_serialized_end
=
23
27
_globals
[
'_GENERATECOMPLETE'
].
_serialized_start
=
23
5
3
_globals
[
'_GENERATECOMPLETE'
].
_serialized_start
=
233
0
_globals
[
'_GENERATECOMPLETE'
].
_serialized_end
=
27
49
_globals
[
'_GENERATECOMPLETE'
].
_serialized_end
=
27
26
_globals
[
'_GENERATEERROR'
].
_serialized_start
=
27
51
_globals
[
'_GENERATEERROR'
].
_serialized_start
=
27
28
_globals
[
'_GENERATEERROR'
].
_serialized_end
=
28
26
_globals
[
'_GENERATEERROR'
].
_serialized_end
=
28
03
_globals
[
'_OUTPUTLOGPROBS'
].
_serialized_start
=
28
28
_globals
[
'_OUTPUTLOGPROBS'
].
_serialized_start
=
28
05
_globals
[
'_OUTPUTLOGPROBS'
].
_serialized_end
=
29
45
_globals
[
'_OUTPUTLOGPROBS'
].
_serialized_end
=
29
22
_globals
[
'_INPUTLOGPROBS'
].
_serialized_start
=
29
48
_globals
[
'_INPUTLOGPROBS'
].
_serialized_start
=
29
25
_globals
[
'_INPUTLOGPROBS'
].
_serialized_end
=
3
106
_globals
[
'_INPUTLOGPROBS'
].
_serialized_end
=
3
083
_globals
[
'_INPUTTOKENLOGPROB'
].
_serialized_start
=
3
1
08
_globals
[
'_INPUTTOKENLOGPROB'
].
_serialized_start
=
308
5
_globals
[
'_INPUTTOKENLOGPROB'
].
_serialized_end
=
31
57
_globals
[
'_INPUTTOKENLOGPROB'
].
_serialized_end
=
31
34
_globals
[
'_TOPLOGPROBS'
].
_serialized_start
=
31
59
_globals
[
'_TOPLOGPROBS'
].
_serialized_start
=
31
36
_globals
[
'_TOPLOGPROBS'
].
_serialized_end
=
3
207
_globals
[
'_TOPLOGPROBS'
].
_serialized_end
=
3
184
_globals
[
'_HIDDENSTATES'
].
_serialized_start
=
3
209
_globals
[
'_HIDDENSTATES'
].
_serialized_start
=
3
186
_globals
[
'_HIDDENSTATES'
].
_serialized_end
=
32
72
_globals
[
'_HIDDENSTATES'
].
_serialized_end
=
32
49
_globals
[
'_EMBEDREQUEST'
].
_serialized_start
=
32
7
5
_globals
[
'_EMBEDREQUEST'
].
_serialized_start
=
325
2
_globals
[
'_EMBEDREQUEST'
].
_serialized_end
=
3
605
_globals
[
'_EMBEDREQUEST'
].
_serialized_end
=
3
582
_globals
[
'_EMBEDRESPONSE'
].
_serialized_start
=
3
608
_globals
[
'_EMBEDRESPONSE'
].
_serialized_start
=
3
585
_globals
[
'_EMBEDRESPONSE'
].
_serialized_end
=
37
65
_globals
[
'_EMBEDRESPONSE'
].
_serialized_end
=
37
42
_globals
[
'_EMBEDCOMPLETE'
].
_serialized_start
=
37
68
_globals
[
'_EMBEDCOMPLETE'
].
_serialized_start
=
37
45
_globals
[
'_EMBEDCOMPLETE'
].
_serialized_end
=
39
31
_globals
[
'_EMBEDCOMPLETE'
].
_serialized_end
=
39
08
_globals
[
'_EMBEDDING'
].
_serialized_start
=
39
33
_globals
[
'_EMBEDDING'
].
_serialized_start
=
39
10
_globals
[
'_EMBEDDING'
].
_serialized_end
=
39
7
5
_globals
[
'_EMBEDDING'
].
_serialized_end
=
395
2
_globals
[
'_EMBEDERROR'
].
_serialized_start
=
39
77
_globals
[
'_EMBEDERROR'
].
_serialized_start
=
39
54
_globals
[
'_EMBEDERROR'
].
_serialized_end
=
40
37
_globals
[
'_EMBEDERROR'
].
_serialized_end
=
40
14
_globals
[
'_HEALTHCHECKREQUEST'
].
_serialized_start
=
40
39
_globals
[
'_HEALTHCHECKREQUEST'
].
_serialized_start
=
40
16
_globals
[
'_HEALTHCHECKREQUEST'
].
_serialized_end
=
4
117
_globals
[
'_HEALTHCHECKREQUEST'
].
_serialized_end
=
4
094
_globals
[
'_HEALTHCHECKRESPONSE'
].
_serialized_start
=
4
119
_globals
[
'_HEALTHCHECKRESPONSE'
].
_serialized_start
=
4
096
_globals
[
'_HEALTHCHECKRESPONSE'
].
_serialized_end
=
41
74
_globals
[
'_HEALTHCHECKRESPONSE'
].
_serialized_end
=
41
51
_globals
[
'_ABORTREQUEST'
].
_serialized_start
=
41
76
_globals
[
'_ABORTREQUEST'
].
_serialized_start
=
41
53
_globals
[
'_ABORTREQUEST'
].
_serialized_end
=
42
26
_globals
[
'_ABORTREQUEST'
].
_serialized_end
=
42
03
_globals
[
'_ABORTRESPONSE'
].
_serialized_start
=
42
28
_globals
[
'_ABORTRESPONSE'
].
_serialized_start
=
42
05
_globals
[
'_ABORTRESPONSE'
].
_serialized_end
=
42
77
_globals
[
'_ABORTRESPONSE'
].
_serialized_end
=
42
54
_globals
[
'_LOADLORAREQUEST'
].
_serialized_start
=
42
79
_globals
[
'_LOADLORAREQUEST'
].
_serialized_start
=
42
56
_globals
[
'_LOADLORAREQUEST'
].
_serialized_end
=
43
5
2
_globals
[
'_LOADLORAREQUEST'
].
_serialized_end
=
432
9
_globals
[
'_LOADLORARESPONSE'
].
_serialized_start
=
43
54
_globals
[
'_LOADLORARESPONSE'
].
_serialized_start
=
43
31
_globals
[
'_LOADLORARESPONSE'
].
_serialized_end
=
44
26
_globals
[
'_LOADLORARESPONSE'
].
_serialized_end
=
44
03
_globals
[
'_UNLOADLORAREQUEST'
].
_serialized_start
=
44
28
_globals
[
'_UNLOADLORAREQUEST'
].
_serialized_start
=
44
05
_globals
[
'_UNLOADLORAREQUEST'
].
_serialized_end
=
44
67
_globals
[
'_UNLOADLORAREQUEST'
].
_serialized_end
=
44
44
_globals
[
'_UNLOADLORARESPONSE'
].
_serialized_start
=
446
9
_globals
[
'_UNLOADLORARESPONSE'
].
_serialized_start
=
44
4
6
_globals
[
'_UNLOADLORARESPONSE'
].
_serialized_end
=
45
23
_globals
[
'_UNLOADLORARESPONSE'
].
_serialized_end
=
45
00
_globals
[
'_UPDATEWEIGHTSREQUEST'
].
_serialized_start
=
452
5
_globals
[
'_UPDATEWEIGHTSREQUEST'
].
_serialized_start
=
45
0
2
_globals
[
'_UPDATEWEIGHTSREQUEST'
].
_serialized_end
=
46
44
_globals
[
'_UPDATEWEIGHTSREQUEST'
].
_serialized_end
=
46
21
_globals
[
'_UPDATEWEIGHTSRESPONSE'
].
_serialized_start
=
46
46
_globals
[
'_UPDATEWEIGHTSRESPONSE'
].
_serialized_start
=
46
23
_globals
[
'_UPDATEWEIGHTSRESPONSE'
].
_serialized_end
=
4
703
_globals
[
'_UPDATEWEIGHTSRESPONSE'
].
_serialized_end
=
4
680
_globals
[
'_GETINTERNALSTATEREQUEST'
].
_serialized_start
=
4
705
_globals
[
'_GETINTERNALSTATEREQUEST'
].
_serialized_start
=
4
682
_globals
[
'_GETINTERNALSTATEREQUEST'
].
_serialized_end
=
47
50
_globals
[
'_GETINTERNALSTATEREQUEST'
].
_serialized_end
=
47
27
_globals
[
'_GETINTERNALSTATERESPONSE'
].
_serialized_start
=
47
5
2
_globals
[
'_GETINTERNALSTATERESPONSE'
].
_serialized_start
=
472
9
_globals
[
'_GETINTERNALSTATERESPONSE'
].
_serialized_end
=
4
818
_globals
[
'_GETINTERNALSTATERESPONSE'
].
_serialized_end
=
4
795
_globals
[
'_SETINTERNALSTATEREQUEST'
].
_serialized_start
=
4
820
_globals
[
'_SETINTERNALSTATEREQUEST'
].
_serialized_start
=
4
797
_globals
[
'_SETINTERNALSTATEREQUEST'
].
_serialized_end
=
48
85
_globals
[
'_SETINTERNALSTATEREQUEST'
].
_serialized_end
=
48
62
_globals
[
'_SETINTERNALSTATERESPONSE'
].
_serialized_start
=
48
87
_globals
[
'_SETINTERNALSTATERESPONSE'
].
_serialized_start
=
48
64
_globals
[
'_SETINTERNALSTATERESPONSE'
].
_serialized_end
=
494
7
_globals
[
'_SETINTERNALSTATERESPONSE'
].
_serialized_end
=
49
2
4
_globals
[
'_SGLANGSCHEDULER'
].
_serialized_start
=
49
50
_globals
[
'_SGLANGSCHEDULER'
].
_serialized_start
=
49
27
_globals
[
'_SGLANGSCHEDULER'
].
_serialized_end
=
53
32
_globals
[
'_SGLANGSCHEDULER'
].
_serialized_end
=
53
09
# @@protoc_insertion_point(module_scope)
# @@protoc_insertion_point(module_scope)
python/sglang/srt/grpc/sglang_scheduler_pb2.pyi
View file @
7ff740a6
...
@@ -82,7 +82,7 @@ class DisaggregatedParams(_message.Message):
...
@@ -82,7 +82,7 @@ class DisaggregatedParams(_message.Message):
def __init__(self, bootstrap_host: _Optional[str] = ..., bootstrap_port: _Optional[int] = ..., bootstrap_room: _Optional[int] = ...) -> None: ...
def __init__(self, bootstrap_host: _Optional[str] = ..., bootstrap_port: _Optional[int] = ..., bootstrap_room: _Optional[int] = ...) -> None: ...
class GenerateRequest(_message.Message):
class GenerateRequest(_message.Message):
__slots__ = ("request_id", "tokenized", "mm_inputs", "sampling_params", "return_logprob", "logprob_start_len", "top_logprobs_num", "token_ids_logprob", "return_hidden_states", "disaggregated_params", "custom_logit_processor", "timestamp", "log_metrics", "input_embeds", "lora_id", "data_parallel_rank",
"dp_balance_id",
"stream")
__slots__ = ("request_id", "tokenized", "mm_inputs", "sampling_params", "return_logprob", "logprob_start_len", "top_logprobs_num", "token_ids_logprob", "return_hidden_states", "disaggregated_params", "custom_logit_processor", "timestamp", "log_metrics", "input_embeds", "lora_id", "data_parallel_rank", "stream")
REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
TOKENIZED_FIELD_NUMBER: _ClassVar[int]
TOKENIZED_FIELD_NUMBER: _ClassVar[int]
MM_INPUTS_FIELD_NUMBER: _ClassVar[int]
MM_INPUTS_FIELD_NUMBER: _ClassVar[int]
...
@@ -99,7 +99,6 @@ class GenerateRequest(_message.Message):
...
@@ -99,7 +99,6 @@ class GenerateRequest(_message.Message):
INPUT_EMBEDS_FIELD_NUMBER: _ClassVar[int]
INPUT_EMBEDS_FIELD_NUMBER: _ClassVar[int]
LORA_ID_FIELD_NUMBER: _ClassVar[int]
LORA_ID_FIELD_NUMBER: _ClassVar[int]
DATA_PARALLEL_RANK_FIELD_NUMBER: _ClassVar[int]
DATA_PARALLEL_RANK_FIELD_NUMBER: _ClassVar[int]
DP_BALANCE_ID_FIELD_NUMBER: _ClassVar[int]
STREAM_FIELD_NUMBER: _ClassVar[int]
STREAM_FIELD_NUMBER: _ClassVar[int]
request_id: str
request_id: str
tokenized: TokenizedInput
tokenized: TokenizedInput
...
@@ -117,9 +116,8 @@ class GenerateRequest(_message.Message):
...
@@ -117,9 +116,8 @@ class GenerateRequest(_message.Message):
input_embeds: _containers.RepeatedScalarFieldContainer[float]
input_embeds: _containers.RepeatedScalarFieldContainer[float]
lora_id: str
lora_id: str
data_parallel_rank: int
data_parallel_rank: int
dp_balance_id: int
stream: bool
stream: bool
def __init__(self, request_id: _Optional[str] = ..., tokenized: _Optional[_Union[TokenizedInput, _Mapping]] = ..., mm_inputs: _Optional[_Union[MultimodalInputs, _Mapping]] = ..., sampling_params: _Optional[_Union[SamplingParams, _Mapping]] = ..., return_logprob: bool = ..., logprob_start_len: _Optional[int] = ..., top_logprobs_num: _Optional[int] = ..., token_ids_logprob: _Optional[_Iterable[int]] = ..., return_hidden_states: bool = ..., disaggregated_params: _Optional[_Union[DisaggregatedParams, _Mapping]] = ..., custom_logit_processor: _Optional[str] = ..., timestamp: _Optional[_Union[datetime.datetime, _timestamp_pb2.Timestamp, _Mapping]] = ..., log_metrics: bool = ..., input_embeds: _Optional[_Iterable[float]] = ..., lora_id: _Optional[str] = ..., data_parallel_rank: _Optional[int] = ...,
dp_balance_id: _Optional[int] = ...,
stream: bool = ...) -> None: ...
def __init__(self, request_id: _Optional[str] = ..., tokenized: _Optional[_Union[TokenizedInput, _Mapping]] = ..., mm_inputs: _Optional[_Union[MultimodalInputs, _Mapping]] = ..., sampling_params: _Optional[_Union[SamplingParams, _Mapping]] = ..., return_logprob: bool = ..., logprob_start_len: _Optional[int] = ..., top_logprobs_num: _Optional[int] = ..., token_ids_logprob: _Optional[_Iterable[int]] = ..., return_hidden_states: bool = ..., disaggregated_params: _Optional[_Union[DisaggregatedParams, _Mapping]] = ..., custom_logit_processor: _Optional[str] = ..., timestamp: _Optional[_Union[datetime.datetime, _timestamp_pb2.Timestamp, _Mapping]] = ..., log_metrics: bool = ..., input_embeds: _Optional[_Iterable[float]] = ..., lora_id: _Optional[str] = ..., data_parallel_rank: _Optional[int] = ..., stream: bool = ...) -> None: ...
class TokenizedInput(_message.Message):
class TokenizedInput(_message.Message):
__slots__ = ("original_text", "input_ids")
__slots__ = ("original_text", "input_ids")
...
...
python/sglang/srt/managers/data_parallel_controller.py
View file @
7ff740a6
...
@@ -17,14 +17,11 @@ import faulthandler
...
@@ -17,14 +17,11 @@ import faulthandler
import
logging
import
logging
import
multiprocessing
as
mp
import
multiprocessing
as
mp
import
signal
import
signal
import
struct
import
sys
import
threading
import
threading
import
time
import
time
from
collections
import
deque
from
collections
import
deque
from
enum
import
Enum
,
auto
from
enum
import
Enum
,
auto
from
multiprocessing
import
shared_memory
from
typing
import
List
from
typing
import
Dict
,
List
import
psutil
import
psutil
import
setproctitle
import
setproctitle
...
@@ -39,7 +36,6 @@ from sglang.srt.managers.io_struct import (
...
@@ -39,7 +36,6 @@ from sglang.srt.managers.io_struct import (
)
)
from
sglang.srt.managers.schedule_batch
import
Req
from
sglang.srt.managers.schedule_batch
import
Req
from
sglang.srt.managers.scheduler
import
run_scheduler_process
from
sglang.srt.managers.scheduler
import
run_scheduler_process
from
sglang.srt.managers.utils
import
DPBalanceMeta
from
sglang.srt.server_args
import
PortArgs
,
ServerArgs
from
sglang.srt.server_args
import
PortArgs
,
ServerArgs
from
sglang.srt.torch_memory_saver_adapter
import
TorchMemorySaverAdapter
from
sglang.srt.torch_memory_saver_adapter
import
TorchMemorySaverAdapter
from
sglang.srt.utils
import
(
from
sglang.srt.utils
import
(
...
@@ -108,15 +104,9 @@ class DPBudget:
...
@@ -108,15 +104,9 @@ class DPBudget:
class
DataParallelController
:
class
DataParallelController
:
"""A controller that dispatches requests to multiple data parallel workers."""
"""A controller that dispatches requests to multiple data parallel workers."""
def
__init__
(
def
__init__
(
self
,
server_args
:
ServerArgs
,
port_args
:
PortArgs
)
->
None
:
self
,
server_args
:
ServerArgs
,
port_args
:
PortArgs
,
dp_balance_meta
:
DPBalanceMeta
,
)
->
None
:
# for dp balance
# for dp balance
self
.
global_balance_id
=
0
self
.
global_balance_id
=
0
self
.
balance_meta
=
dp_balance_meta
# Parse args
# Parse args
self
.
max_total_num_tokens
=
None
self
.
max_total_num_tokens
=
None
...
@@ -322,7 +312,6 @@ class DataParallelController:
...
@@ -322,7 +312,6 @@ class DataParallelController:
pp_rank
,
pp_rank
,
dp_rank
,
dp_rank
,
writer
,
writer
,
self
.
balance_meta
,
),
),
)
)
with
memory_saver_adapter
.
configure_subprocess
():
with
memory_saver_adapter
.
configure_subprocess
():
...
@@ -370,31 +359,11 @@ class DataParallelController:
...
@@ -370,31 +359,11 @@ class DataParallelController:
if
self
.
maybe_external_dp_rank_routing
(
req
):
if
self
.
maybe_external_dp_rank_routing
(
req
):
return
return
# This variable corresponds to the balance_id in TokenizedGenerateReqInput.
logger
.
warning
(
# We use it to control the number of onfly tokens (requests dispatched to workers but not yet received).
"The 'minimum_tokens' load balancing method is deprecated for now and will be introduced later."
def
get_next_global_balance_id
()
->
int
:
"Fall back to 'round_robin_scheduler'"
INT32_MAX
=
2147483647
)
current_id
=
self
.
global_balance_id
self
.
round_robin_scheduler
(
req
)
self
.
global_balance_id
=
(
self
.
global_balance_id
+
1
)
%
INT32_MAX
return
current_id
req
.
dp_balance_id
=
get_next_global_balance_id
()
with
self
.
balance_meta
.
mutex
:
# 1. local_tokens represents the tokens currently inferring on the worker,
# while onfly refers to the requests dispatched by the dispatcher but not yet received by the scheduler.
onfly_info
=
self
.
balance_meta
.
get_shared_onfly
()
local_tokens
=
self
.
balance_meta
.
get_shared_local_tokens
()
total_tokens
=
[
local_token
+
sum
(
onfly_dict
.
values
())
for
local_token
,
onfly_dict
in
zip
(
local_tokens
,
onfly_info
)
]
target_worker
=
total_tokens
.
index
(
min
(
total_tokens
))
onfly_info
[
target_worker
][
req
.
dp_balance_id
]
=
len
(
req
.
input_ids
)
# 2. write the new onfly info to the shm
self
.
balance_meta
.
set_shared_onfly_info
(
onfly_info
)
# logger.info(f"dp workers {local_tokens=}, {onfly_info=}, {target_worker=}")
self
.
workers
[
target_worker
].
send_pyobj
(
req
)
def
event_loop
(
self
):
def
event_loop
(
self
):
while
True
:
while
True
:
...
@@ -416,12 +385,9 @@ def run_data_parallel_controller_process(
...
@@ -416,12 +385,9 @@ def run_data_parallel_controller_process(
faulthandler
.
enable
()
faulthandler
.
enable
()
configure_logger
(
server_args
)
configure_logger
(
server_args
)
parent_process
=
psutil
.
Process
().
parent
()
parent_process
=
psutil
.
Process
().
parent
()
balance_meta
=
DPBalanceMeta
(
server_args
.
dp_size
)
try
:
try
:
controller
=
DataParallelController
(
controller
=
DataParallelController
(
server_args
,
port_args
)
server_args
,
port_args
,
dp_balance_meta
=
balance_meta
)
pipe_writer
.
send
(
pipe_writer
.
send
(
{
{
"status"
:
"ready"
,
"status"
:
"ready"
,
...
@@ -440,6 +406,3 @@ def run_data_parallel_controller_process(
...
@@ -440,6 +406,3 @@ def run_data_parallel_controller_process(
traceback
=
get_exception_traceback
()
traceback
=
get_exception_traceback
()
logger
.
error
(
f
"DataParallelController hit an exception:
{
traceback
}
"
)
logger
.
error
(
f
"DataParallelController hit an exception:
{
traceback
}
"
)
parent_process
.
send_signal
(
signal
.
SIGQUIT
)
parent_process
.
send_signal
(
signal
.
SIGQUIT
)
finally
:
# we need to destruct mp.Manager() in balance_meta
balance_meta
.
destructor
()
python/sglang/srt/managers/io_struct.py
View file @
7ff740a6
...
@@ -606,9 +606,6 @@ class TokenizedGenerateReqInput:
...
@@ -606,9 +606,6 @@ class TokenizedGenerateReqInput:
# For data parallel rank routing
# For data parallel rank routing
data_parallel_rank
:
Optional
[
int
]
=
None
data_parallel_rank
:
Optional
[
int
]
=
None
# For dp balance
dp_balance_id
:
int
=
-
1
# Priority for the request
# Priority for the request
priority
:
Optional
[
int
]
=
None
priority
:
Optional
[
int
]
=
None
...
@@ -778,8 +775,6 @@ class TokenizedEmbeddingReqInput:
...
@@ -778,8 +775,6 @@ class TokenizedEmbeddingReqInput:
sampling_params
:
SamplingParams
sampling_params
:
SamplingParams
# For data parallel rank routing
# For data parallel rank routing
data_parallel_rank
:
Optional
[
int
]
=
None
data_parallel_rank
:
Optional
[
int
]
=
None
# For dp balance
dp_balance_id
:
int
=
-
1
# Priority for the request
# Priority for the request
priority
:
Optional
[
int
]
=
None
priority
:
Optional
[
int
]
=
None
...
...
python/sglang/srt/managers/scheduler.py
View file @
7ff740a6
...
@@ -145,7 +145,7 @@ from sglang.srt.managers.scheduler_update_weights_mixin import (
...
@@ -145,7 +145,7 @@ from sglang.srt.managers.scheduler_update_weights_mixin import (
from
sglang.srt.managers.session_controller
import
Session
from
sglang.srt.managers.session_controller
import
Session
from
sglang.srt.managers.tp_worker
import
TpModelWorker
from
sglang.srt.managers.tp_worker
import
TpModelWorker
from
sglang.srt.managers.tp_worker_overlap_thread
import
TpModelWorkerClient
from
sglang.srt.managers.tp_worker_overlap_thread
import
TpModelWorkerClient
from
sglang.srt.managers.utils
import
DPBalanceMeta
,
validate_input_length
from
sglang.srt.managers.utils
import
validate_input_length
from
sglang.srt.mem_cache.chunk_cache
import
ChunkCache
,
SWAChunkCache
from
sglang.srt.mem_cache.chunk_cache
import
ChunkCache
,
SWAChunkCache
from
sglang.srt.mem_cache.hiradix_cache
import
HiRadixCache
from
sglang.srt.mem_cache.hiradix_cache
import
HiRadixCache
from
sglang.srt.mem_cache.radix_cache
import
RadixCache
from
sglang.srt.mem_cache.radix_cache
import
RadixCache
...
@@ -271,7 +271,6 @@ class Scheduler(
...
@@ -271,7 +271,6 @@ class Scheduler(
moe_ep_rank
:
int
,
moe_ep_rank
:
int
,
pp_rank
:
int
,
pp_rank
:
int
,
dp_rank
:
Optional
[
int
],
dp_rank
:
Optional
[
int
],
dp_balance_meta
:
Optional
[
DPBalanceMeta
]
=
None
,
):
):
# Parse args
# Parse args
self
.
server_args
=
server_args
self
.
server_args
=
server_args
...
@@ -600,7 +599,6 @@ class Scheduler(
...
@@ -600,7 +599,6 @@ class Scheduler(
# Init metrics stats
# Init metrics stats
self
.
init_metrics
(
tp_rank
,
pp_rank
,
dp_rank
)
self
.
init_metrics
(
tp_rank
,
pp_rank
,
dp_rank
)
self
.
init_dp_balance
(
dp_balance_meta
)
if
self
.
enable_kv_cache_events
:
if
self
.
enable_kv_cache_events
:
self
.
init_kv_events
(
server_args
.
kv_events_config
)
self
.
init_kv_events
(
server_args
.
kv_events_config
)
...
@@ -1270,8 +1268,6 @@ class Scheduler(
...
@@ -1270,8 +1268,6 @@ class Scheduler(
self
,
self
,
recv_req
:
TokenizedGenerateReqInput
,
recv_req
:
TokenizedGenerateReqInput
,
):
):
self
.
maybe_update_dp_balance_data
(
recv_req
)
# Create a new request
# Create a new request
if
(
if
(
recv_req
.
session_params
is
None
recv_req
.
session_params
is
None
...
@@ -1797,7 +1793,6 @@ class Scheduler(
...
@@ -1797,7 +1793,6 @@ class Scheduler(
# Handle DP attention
# Handle DP attention
if
need_dp_attn_preparation
:
if
need_dp_attn_preparation
:
self
.
maybe_handle_dp_balance_data
()
ret
=
self
.
prepare_mlp_sync_batch
(
ret
)
ret
=
self
.
prepare_mlp_sync_batch
(
ret
)
return
ret
return
ret
...
@@ -2803,7 +2798,6 @@ def run_scheduler_process(
...
@@ -2803,7 +2798,6 @@ def run_scheduler_process(
pp_rank
:
int
,
pp_rank
:
int
,
dp_rank
:
Optional
[
int
],
dp_rank
:
Optional
[
int
],
pipe_writer
,
pipe_writer
,
balance_meta
:
Optional
[
DPBalanceMeta
]
=
None
,
):
):
# Generate the logger prefix
# Generate the logger prefix
prefix
=
""
prefix
=
""
...
@@ -2852,7 +2846,6 @@ def run_scheduler_process(
...
@@ -2852,7 +2846,6 @@ def run_scheduler_process(
moe_ep_rank
,
moe_ep_rank
,
pp_rank
,
pp_rank
,
dp_rank
,
dp_rank
,
dp_balance_meta
=
balance_meta
,
)
)
pipe_writer
.
send
(
pipe_writer
.
send
(
{
{
...
...
python/sglang/srt/managers/scheduler_metrics_mixin.py
View file @
7ff740a6
...
@@ -12,7 +12,6 @@ from sglang.srt.disaggregation.utils import DisaggregationMode
...
@@ -12,7 +12,6 @@ from sglang.srt.disaggregation.utils import DisaggregationMode
from
sglang.srt.managers.io_struct
import
TokenizedGenerateReqInput
from
sglang.srt.managers.io_struct
import
TokenizedGenerateReqInput
from
sglang.srt.managers.schedule_policy
import
PrefillAdder
from
sglang.srt.managers.schedule_policy
import
PrefillAdder
from
sglang.srt.managers.scheduler
import
Req
,
ScheduleBatch
from
sglang.srt.managers.scheduler
import
Req
,
ScheduleBatch
from
sglang.srt.managers.utils
import
DPBalanceMeta
from
sglang.srt.metrics.collector
import
SchedulerMetricsCollector
,
SchedulerStats
from
sglang.srt.metrics.collector
import
SchedulerMetricsCollector
,
SchedulerStats
from
sglang.srt.utils
import
get_bool_env_var
from
sglang.srt.utils
import
get_bool_env_var
...
@@ -64,16 +63,6 @@ class SchedulerMetricsMixin:
...
@@ -64,16 +63,6 @@ class SchedulerMetricsMixin:
labels
[
"dp_rank"
]
=
dp_rank
labels
[
"dp_rank"
]
=
dp_rank
self
.
metrics_collector
=
SchedulerMetricsCollector
(
labels
=
labels
)
self
.
metrics_collector
=
SchedulerMetricsCollector
(
labels
=
labels
)
def
init_dp_balance
(
self
:
Scheduler
,
dp_balance_meta
:
Optional
[
DPBalanceMeta
]):
self
.
balance_meta
=
dp_balance_meta
if
(
self
.
server_args
.
enable_dp_attention
and
self
.
server_args
.
load_balance_method
==
"minimum_tokens"
):
assert
dp_balance_meta
is
not
None
self
.
recv_dp_balance_id_this_term
=
[]
def
init_kv_events
(
self
:
Scheduler
,
kv_events_config
:
Optional
[
str
]):
def
init_kv_events
(
self
:
Scheduler
,
kv_events_config
:
Optional
[
str
]):
if
self
.
enable_kv_cache_events
:
if
self
.
enable_kv_cache_events
:
self
.
kv_event_publisher
=
EventPublisherFactory
.
create
(
self
.
kv_event_publisher
=
EventPublisherFactory
.
create
(
...
@@ -319,91 +308,6 @@ class SchedulerMetricsMixin:
...
@@ -319,91 +308,6 @@ class SchedulerMetricsMixin:
batch
=
KVEventBatch
(
ts
=
time
.
time
(),
events
=
events
)
batch
=
KVEventBatch
(
ts
=
time
.
time
(),
events
=
events
)
self
.
kv_event_publisher
.
publish
(
batch
)
self
.
kv_event_publisher
.
publish
(
batch
)
def
maybe_update_dp_balance_data
(
self
:
Scheduler
,
recv_req
:
TokenizedGenerateReqInput
):
if
(
self
.
server_args
.
enable_dp_attention
and
self
.
server_args
.
load_balance_method
==
"minimum_tokens"
):
self
.
recv_dp_balance_id_this_term
.
append
(
recv_req
.
dp_balance_id
)
def
maybe_handle_dp_balance_data
(
self
:
Scheduler
):
if
(
self
.
server_args
.
load_balance_method
==
"minimum_tokens"
and
self
.
forward_ct
%
40
==
0
):
holding_tokens
=
self
.
get_load
().
num_tokens
new_recv_dp_balance_id_list
,
holding_token_list
=
(
self
.
gather_dp_balance_info
(
holding_tokens
)
)
self
.
recv_dp_balance_id_this_term
.
clear
()
if
self
.
tp_rank
==
0
:
# only first worker write info
self
.
write_shared_dp_balance_info
(
new_recv_dp_balance_id_list
,
holding_token_list
)
def
gather_dp_balance_info
(
self
:
Scheduler
,
holding_tokens_list
)
->
Union
[
None
,
List
[
List
[
int
]]]:
"""gather recv_dp_balance_id_this_term and holding tokens per worker for dp balance"""
recv_list
=
self
.
recv_dp_balance_id_this_term
assert
len
(
recv_list
)
<=
511
,
(
"The number of requests received this round is too large. "
"Please increase gather_tensor_size and onfly_info_size."
)
# The maximum size of the tensor used for gathering data from all workers.
gather_tensor_size
=
512
# recv_tensor: | holding_tokens | len(recv_dp_balance_id) | recv_dp_balance_ids
recv_tensor
=
torch
.
zeros
(
gather_tensor_size
,
dtype
=
torch
.
int32
)
recv_tensor
[
0
]
=
holding_tokens_list
recv_tensor
[
1
]
=
len
(
recv_list
)
# The first element is the length of the list.
recv_tensor
[
2
:
len
(
recv_list
)
+
2
]
=
torch
.
tensor
(
recv_list
,
dtype
=
torch
.
int32
)
if
self
.
tp_rank
==
0
:
gathered_list
=
[
torch
.
zeros
(
gather_tensor_size
,
dtype
=
torch
.
int32
)
for
_
in
range
(
self
.
balance_meta
.
num_workers
)
]
else
:
gathered_list
=
None
torch
.
distributed
.
gather
(
recv_tensor
,
gathered_list
,
group
=
self
.
tp_cpu_group
)
gathered_id_list_per_worker
=
None
if
self
.
tp_rank
==
0
:
gathered_id_list_per_worker
=
[]
holding_tokens_list
=
[]
for
tensor
in
gathered_list
:
holding_tokens_list
.
append
(
tensor
[
0
].
item
())
list_length
=
tensor
[
1
].
item
()
gathered_id_list_per_worker
.
append
(
tensor
[
2
:
list_length
+
2
].
tolist
())
return
gathered_id_list_per_worker
,
holding_tokens_list
def
write_shared_dp_balance_info
(
self
:
Scheduler
,
new_recv_rid_lists
,
local_tokens
):
meta
=
self
.
balance_meta
with
meta
.
mutex
:
onfly_list
:
List
[
Dict
[
int
,
int
]]
=
meta
.
get_shared_onfly
()
assert
len
(
new_recv_rid_lists
)
==
len
(
onfly_list
),
"num_worker not equal"
# 1.Check if the rid received by each worker this round is present in onfly.
# If it is, remove the corresponding onfly item.
worker_id
=
0
for
new_recv_rids
,
on_fly_reqs
in
zip
(
new_recv_rid_lists
,
onfly_list
):
for
new_recv_rid
in
new_recv_rids
:
assert
(
new_recv_rid
in
on_fly_reqs
),
f
"
{
new_recv_rid
=
}
not in
{
worker_id
=
}
{
on_fly_reqs
=
}
, data consistency is wrong"
del
on_fly_reqs
[
new_recv_rid
]
worker_id
+=
1
# 2. Atomically write local_tokens and onfly into shm under the mutex
meta
.
set_shared_onfly_info
(
onfly_list
)
meta
.
set_shared_local_tokens
(
local_tokens
)
def
calculate_utilization
(
self
):
def
calculate_utilization
(
self
):
if
self
.
disaggregation_mode
==
DisaggregationMode
.
PREFILL
:
if
self
.
disaggregation_mode
==
DisaggregationMode
.
PREFILL
:
self
.
stats
.
utilization
=
-
1
self
.
stats
.
utilization
=
-
1
...
...
python/sglang/srt/managers/utils.py
View file @
7ff740a6
...
@@ -96,46 +96,3 @@ def get_logprob_from_pp_outputs(
...
@@ -96,46 +96,3 @@ def get_logprob_from_pp_outputs(
]
]
return
logits_output
,
extend_input_len_per_req
,
extend_logprob_start_len_per_req
return
logits_output
,
extend_input_len_per_req
,
extend_logprob_start_len_per_req
class
DPBalanceMeta
:
"""
This class will be use in scheduler and dp controller
"""
def
__init__
(
self
,
num_workers
:
int
):
self
.
num_workers
=
num_workers
self
.
_manager
=
mp
.
Manager
()
self
.
mutex
=
self
.
_manager
.
Lock
()
init_local_tokens
=
[
0
]
*
self
.
num_workers
init_onfly_info
=
[
self
.
_manager
.
dict
()
for
_
in
range
(
self
.
num_workers
)]
self
.
shared_state
=
self
.
_manager
.
Namespace
()
self
.
shared_state
.
local_tokens
=
self
.
_manager
.
list
(
init_local_tokens
)
self
.
shared_state
.
onfly_info
=
self
.
_manager
.
list
(
init_onfly_info
)
def
destructor
(
self
):
# we must destroy this class manually (shut down the mp.Manager)
self
.
_manager
.
shutdown
()
def
get_shared_onfly
(
self
)
->
List
[
Dict
[
int
,
int
]]:
return
[
dict
(
d
)
for
d
in
self
.
shared_state
.
onfly_info
]
def
set_shared_onfly_info
(
self
,
data
:
List
[
Dict
[
int
,
int
]]):
self
.
shared_state
.
onfly_info
=
data
def
get_shared_local_tokens
(
self
)
->
List
[
int
]:
return
list
(
self
.
shared_state
.
local_tokens
)
def
set_shared_local_tokens
(
self
,
data
:
List
[
int
]):
self
.
shared_state
.
local_tokens
=
data
def
__getstate__
(
self
):
state
=
self
.
__dict__
.
copy
()
del
state
[
"_manager"
]
return
state
def
__setstate__
(
self
,
state
):
self
.
__dict__
.
update
(
state
)
self
.
_manager
=
None
sgl-router/src/proto/sglang_scheduler.proto
View file @
7ff740a6
...
@@ -120,11 +120,8 @@ message GenerateRequest {
...
@@ -120,11 +120,8 @@ message GenerateRequest {
// Data parallel routing
// Data parallel routing
int32
data_parallel_rank
=
16
;
int32
data_parallel_rank
=
16
;
// For load balancing
int32
dp_balance_id
=
17
;
// Whether client wants streaming response
// Whether client wants streaming response
bool
stream
=
1
8
;
bool
stream
=
1
7
;
}
}
message
TokenizedInput
{
message
TokenizedInput
{
...
...
test/srt/test_dp_attention.py
View file @
7ff740a6
...
@@ -124,47 +124,5 @@ class TestDPAttentionDP2TP2DeepseekV3MTP(CustomTestCase):
...
@@ -124,47 +124,5 @@ class TestDPAttentionDP2TP2DeepseekV3MTP(CustomTestCase):
self
.
assertGreater
(
avg_spec_accept_length
,
2.5
)
self
.
assertGreater
(
avg_spec_accept_length
,
2.5
)
class
TestDPAttentionMinimumTokenLoadBalance
(
CustomTestCase
):
@
classmethod
def
setUpClass
(
cls
):
cls
.
model
=
DEFAULT_MLA_MODEL_NAME_FOR_TEST
cls
.
base_url
=
DEFAULT_URL_FOR_TEST
cls
.
process
=
popen_launch_server
(
cls
.
model
,
cls
.
base_url
,
timeout
=
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
other_args
=
[
"--trust-remote-code"
,
"--tp"
,
"2"
,
"--enable-dp-attention"
,
"--dp"
,
"2"
,
"--enable-torch-compile"
,
"--torch-compile-max-bs"
,
"2"
,
"--load-balance-method"
,
"minimum_tokens"
,
],
)
@
classmethod
def
tearDownClass
(
cls
):
kill_process_tree
(
cls
.
process
.
pid
)
def
test_mgsm_en
(
self
):
args
=
SimpleNamespace
(
base_url
=
self
.
base_url
,
model
=
self
.
model
,
eval_name
=
"mgsm_en"
,
num_examples
=
None
,
num_threads
=
1024
,
)
metrics
=
run_eval
(
args
)
print
(
f
"
{
metrics
=
}
"
)
self
.
assertGreater
(
metrics
[
"score"
],
0.8
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
unittest
.
main
()
unittest
.
main
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment