Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
53ca1552
Unverified
Commit
53ca1552
authored
Sep 11, 2025
by
Chang Su
Committed by
GitHub
Sep 11, 2025
Browse files
Implement Standalone gRPC Server for SGLang Python Scheduler (#10283)
parent
a23bdeaf
Changes
11
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
2486 additions
and
285 deletions
+2486
-285
.pre-commit-config.yaml
.pre-commit-config.yaml
+8
-2
python/sglang/srt/entrypoints/grpc_request_manager.py
python/sglang/srt/entrypoints/grpc_request_manager.py
+580
-0
python/sglang/srt/entrypoints/grpc_server.py
python/sglang/srt/entrypoints/grpc_server.py
+680
-0
python/sglang/srt/grpc/__init__.py
python/sglang/srt/grpc/__init__.py
+1
-0
python/sglang/srt/grpc/sglang_scheduler.proto
python/sglang/srt/grpc/sglang_scheduler.proto
+389
-0
python/sglang/srt/grpc/sglang_scheduler_pb2.py
python/sglang/srt/grpc/sglang_scheduler_pb2.py
+106
-0
python/sglang/srt/grpc/sglang_scheduler_pb2.pyi
python/sglang/srt/grpc/sglang_scheduler_pb2.pyi
+427
-0
python/sglang/srt/grpc/sglang_scheduler_pb2_grpc.py
python/sglang/srt/grpc/sglang_scheduler_pb2_grpc.py
+236
-0
python/sglang/srt/server_args.py
python/sglang/srt/server_args.py
+1
-0
sgl-router/src/grpc/client.rs
sgl-router/src/grpc/client.rs
+36
-109
sgl-router/src/proto/sglang_scheduler.proto
sgl-router/src/proto/sglang_scheduler.proto
+22
-174
No files found.
.pre-commit-config.yaml
View file @
53ca1552
...
...
@@ -22,17 +22,19 @@ repos:
rev
:
5.13.2
hooks
:
-
id
:
isort
exclude
:
'
^python/sglang/srt/grpc/.*_pb2\.py$|^python/sglang/srt/grpc/.*_pb2_grpc\.py$|^python/sglang/srt/grpc/.*_pb2\.pyi$|^python/sglang/srt/grpc/.*_pb2_grpc\.pyi$'
-
repo
:
https://github.com/astral-sh/ruff-pre-commit
rev
:
v0.11.7
hooks
:
-
id
:
ruff
args
:
[
--select=F401
,
--fixable=F401
]
files
:
^(benchmark/|docs/|examples/)
exclude
:
\.ipynb$
exclude
:
\.ipynb$
|^python/sglang/srt/grpc/.*_pb2\.py$|^python/sglang/srt/grpc/.*_pb2_grpc\.py$|^python/sglang/srt/grpc/.*_pb2\.pyi$|^python/sglang/srt/grpc/.*_pb2_grpc\.pyi$
-
repo
:
https://github.com/psf/black
rev
:
24.10.0
hooks
:
-
id
:
black-jupyter
exclude
:
'
^python/sglang/srt/grpc/.*_pb2\.py$|^python/sglang/srt/grpc/.*_pb2_grpc\.py$|^python/sglang/srt/grpc/.*_pb2\.pyi$|^python/sglang/srt/grpc/.*_pb2_grpc\.pyi$'
-
repo
:
https://github.com/codespell-project/codespell
rev
:
v2.4.1
hooks
:
...
...
@@ -42,7 +44,11 @@ repos:
exclude
:
|
(?x)^(
test/srt/test_reasoning_parser\.py|
docs/advanced_features/vlm_query\.ipynb
docs/advanced_features/vlm_query\.ipynb|
python/sglang/srt/grpc/.*_pb2\.py|
python/sglang/srt/grpc/.*_pb2_grpc\.py|
python/sglang/srt/grpc/.*_pb2\.pyi|
python/sglang/srt/grpc/.*_pb2_grpc\.pyi
)$
-
repo
:
https://github.com/pre-commit/mirrors-clang-format
rev
:
v18.1.8
...
...
python/sglang/srt/entrypoints/grpc_request_manager.py
0 → 100644
View file @
53ca1552
This diff is collapsed.
Click to expand it.
python/sglang/srt/entrypoints/grpc_server.py
0 → 100644
View file @
53ca1552
This diff is collapsed.
Click to expand it.
python/sglang/srt/grpc/__init__.py
0 → 100644
View file @
53ca1552
# SGLang gRPC module
python/sglang/srt/grpc/sglang_scheduler.proto
0 → 100644
View file @
53ca1552
syntax
=
"proto3"
;
package
sglang
.
grpc.scheduler
;
import
"google/protobuf/timestamp.proto"
;
import
"google/protobuf/struct.proto"
;
// Service definition for SGLang scheduler communication
// This protocol bridges the Rust router and Python scheduler
service
SglangScheduler
{
// Submit a generation request (supports streaming)
rpc
Generate
(
GenerateRequest
)
returns
(
stream
GenerateResponse
);
// Submit an embedding request
rpc
Embed
(
EmbedRequest
)
returns
(
EmbedResponse
);
// Health check and metrics
rpc
HealthCheck
(
HealthCheckRequest
)
returns
(
HealthCheckResponse
);
// Abort a running request
rpc
Abort
(
AbortRequest
)
returns
(
AbortResponse
);
}
// =====================
// Common Types
// =====================
// Sampling parameters matching SGLang's SamplingParams
message
SamplingParams
{
float
temperature
=
1
;
float
top_p
=
2
;
int32
top_k
=
3
;
float
min_p
=
4
;
float
frequency_penalty
=
5
;
float
presence_penalty
=
6
;
float
repetition_penalty
=
7
;
int32
max_new_tokens
=
8
;
repeated
string
stop
=
9
;
repeated
int32
stop_token_ids
=
10
;
bool
skip_special_tokens
=
11
;
bool
spaces_between_special_tokens
=
12
;
// Structured generation
oneof
constraint
{
string
regex
=
13
;
string
json_schema
=
14
;
string
ebnf_grammar
=
15
;
}
// LoRA adapter
string
lora_path
=
16
;
// Speculative decoding
int32
n
=
17
;
// Number of samples
// Token healing
bool
token_healing
=
18
;
// Additional parameters
int32
min_new_tokens
=
19
;
bool
ignore_eos
=
20
;
bool
no_stop_trim
=
21
;
int32
stream_interval
=
22
;
map
<
string
,
float
>
logit_bias
=
23
;
string
structural_tag
=
24
;
// Custom parameters for extensibility
google.protobuf.Struct
custom_params
=
25
;
}
// Disaggregated serving parameters
message
DisaggregatedParams
{
string
bootstrap_host
=
1
;
int32
bootstrap_port
=
2
;
int32
bootstrap_room
=
3
;
}
// =====================
// Generate Request
// =====================
message
GenerateRequest
{
string
request_id
=
1
;
// Input must be tokenized (no raw text)
TokenizedInput
tokenized
=
2
;
// Multimodal inputs
MultimodalInputs
mm_inputs
=
3
;
// Generation parameters
SamplingParams
sampling_params
=
4
;
// Return options
bool
return_logprob
=
5
;
int32
logprob_start_len
=
6
;
int32
top_logprobs_num
=
7
;
repeated
int32
token_ids_logprob
=
8
;
bool
return_hidden_states
=
9
;
// For disaggregated serving
DisaggregatedParams
disaggregated_params
=
10
;
// Custom logit processor (serialized)
string
custom_logit_processor
=
11
;
// Request metadata
google.protobuf.Timestamp
timestamp
=
12
;
bool
log_metrics
=
13
;
// Input embeddings (alternative to text/tokens)
repeated
float
input_embeds
=
14
;
// LoRA adapter ID (if pre-loaded)
string
lora_id
=
15
;
// Data parallel routing
int32
data_parallel_rank
=
16
;
// For load balancing
int32
dp_balance_id
=
17
;
}
message
TokenizedInput
{
string
original_text
=
1
;
// For reference
repeated
int32
input_ids
=
2
;
}
message
MultimodalInputs
{
// Simplified multimodal handling - actual data processed by tokenizer
repeated
string
image_urls
=
1
;
repeated
string
video_urls
=
2
;
repeated
string
audio_urls
=
3
;
// Pre-processed multimodal features (if available)
google.protobuf.Struct
processed_features
=
4
;
// Raw data for direct processing
repeated
bytes
image_data
=
5
;
repeated
bytes
video_data
=
6
;
repeated
bytes
audio_data
=
7
;
// Modality metadata
repeated
string
modalities
=
8
;
}
// =====================
// Generate Response
// =====================
message
GenerateResponse
{
string
request_id
=
1
;
// Response type
oneof
response
{
GenerateStreamChunk
chunk
=
2
;
GenerateComplete
complete
=
3
;
GenerateError
error
=
4
;
}
}
message
GenerateStreamChunk
{
// Generated token
int32
token_id
=
1
;
string
text
=
2
;
// Cumulative counts
int32
prompt_tokens
=
3
;
int32
completion_tokens
=
4
;
int32
cached_tokens
=
5
;
// Logprobs (if requested)
LogProbs
logprobs
=
6
;
// Hidden states (if requested)
repeated
float
hidden_states
=
7
;
// Metadata
float
generation_time
=
8
;
// Time to generate this token
int32
queue_time
=
9
;
// Time spent in queue
}
message
GenerateComplete
{
// Final output
repeated
int32
output_ids
=
1
;
string
output_text
=
2
;
// Finish reason
enum
FinishReason
{
// The model generated a stop sequence.
STOP
=
0
;
// The model reached the maximum generation length.
LENGTH
=
1
;
// The model generated an end-of-sequence (EOS) token.
EOS_TOKEN
=
2
;
// The model generated a user-provided stop string.
STOP_STR
=
3
;
// The request was aborted by the user or system.
ABORT
=
4
;
}
FinishReason
finish_reason
=
3
;
// All logprobs if requested
repeated
LogProbs
all_logprobs
=
11
;
// All hidden states if requested
repeated
HiddenStates
all_hidden_states
=
12
;
}
message
GenerateError
{
string
message
=
1
;
string
http_status_code
=
2
;
string
details
=
3
;
}
message
LogProbs
{
repeated
float
token_logprobs
=
1
;
repeated
int32
token_ids
=
2
;
// Top logprobs at each position
repeated
TopLogProbs
top_logprobs
=
3
;
// Decoded text for tokens
repeated
string
token_texts
=
4
;
}
message
TopLogProbs
{
repeated
float
values
=
1
;
repeated
int32
token_ids
=
2
;
repeated
string
token_texts
=
3
;
}
message
HiddenStates
{
repeated
float
values
=
1
;
int32
layer
=
2
;
int32
position
=
3
;
}
// =====================
// Embedding Request
// =====================
message
EmbedRequest
{
string
request_id
=
1
;
// Input must be tokenized (no raw text)
TokenizedInput
tokenized
=
2
;
// Multimodal inputs
MultimodalInputs
mm_inputs
=
4
;
// Dummy sampling params for compatibility
// EmbedRequest doesn't use sampling_params
SamplingParams
sampling_params
=
5
;
bool
log_metrics
=
6
;
// Token type IDs for models that require them
repeated
int32
token_type_ids
=
7
;
// Data parallel routing
int32
data_parallel_rank
=
8
;
// For cross-encoder requests
bool
is_cross_encoder
=
9
;
repeated
string
texts
=
10
;
// For cross-encoder batch
}
message
EmbedResponse
{
string
request_id
=
1
;
oneof
response
{
EmbedComplete
complete
=
2
;
EmbedError
error
=
3
;
}
}
message
EmbedComplete
{
repeated
float
embedding
=
1
;
int32
prompt_tokens
=
2
;
int32
cached_tokens
=
3
;
// Additional metadata
int32
embedding_dim
=
4
;
float
generation_time
=
5
;
// For batch embeddings
repeated
Embedding
batch_embeddings
=
6
;
}
message
Embedding
{
repeated
float
values
=
1
;
int32
index
=
2
;
}
message
EmbedError
{
string
message
=
1
;
string
code
=
2
;
string
details
=
3
;
}
// =====================
// Management Operations
// =====================
message
HealthCheckRequest
{
// Input for health test generation (must be tokenized)
TokenizedInput
tokenized
=
1
;
}
message
HealthCheckResponse
{
bool
healthy
=
1
;
string
message
=
2
;
}
message
AbortRequest
{
string
request_id
=
1
;
string
reason
=
2
;
}
message
AbortResponse
{
bool
success
=
1
;
string
message
=
2
;
}
// =====================
// Additional Operations (Future)
// =====================
// Load LoRA adapter
message
LoadLoRARequest
{
string
adapter_id
=
1
;
string
adapter_path
=
2
;
int32
rank
=
3
;
}
message
LoadLoRAResponse
{
bool
success
=
1
;
string
adapter_id
=
2
;
string
message
=
3
;
}
// Unload LoRA adapter
message
UnloadLoRARequest
{
string
adapter_id
=
1
;
}
message
UnloadLoRAResponse
{
bool
success
=
1
;
string
message
=
2
;
}
// Update weights
message
UpdateWeightsRequest
{
oneof
source
{
string
disk_path
=
1
;
bytes
tensor_data
=
2
;
string
remote_url
=
3
;
}
string
weight_name
=
4
;
}
message
UpdateWeightsResponse
{
bool
success
=
1
;
string
message
=
2
;
}
// Get internal state for debugging
message
GetInternalStateRequest
{
repeated
string
state_keys
=
1
;
}
message
GetInternalStateResponse
{
google.protobuf.Struct
state
=
1
;
}
// Set internal state for testing
message
SetInternalStateRequest
{
google.protobuf.Struct
state
=
1
;
}
message
SetInternalStateResponse
{
bool
success
=
1
;
string
message
=
2
;
}
python/sglang/srt/grpc/sglang_scheduler_pb2.py
0 → 100644
View file @
53ca1552
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# NO CHECKED-IN PROTOBUF GENCODE
# source: sglang_scheduler.proto
# Protobuf Python Version: 6.31.1
"""Generated protocol buffer code."""
from
google.protobuf
import
descriptor
as
_descriptor
from
google.protobuf
import
descriptor_pool
as
_descriptor_pool
from
google.protobuf
import
runtime_version
as
_runtime_version
from
google.protobuf
import
symbol_database
as
_symbol_database
from
google.protobuf.internal
import
builder
as
_builder
_runtime_version
.
ValidateProtobufRuntimeVersion
(
_runtime_version
.
Domain
.
PUBLIC
,
6
,
31
,
1
,
''
,
'sglang_scheduler.proto'
)
# @@protoc_insertion_point(imports)
_sym_db
=
_symbol_database
.
Default
()
from
google.protobuf
import
timestamp_pb2
as
google_dot_protobuf_dot_timestamp__pb2
from
google.protobuf
import
struct_pb2
as
google_dot_protobuf_dot_struct__pb2
DESCRIPTOR
=
_descriptor_pool
.
Default
().
AddSerializedFile
(
b
'
\n\x16
sglang_scheduler.proto
\x12\x15
sglang.grpc.scheduler
\x1a\x1f
google/protobuf/timestamp.proto
\x1a\x1c
google/protobuf/struct.proto
\"\xc7\x05\n\x0e
SamplingParams
\x12\x13\n\x0b
temperature
\x18\x01
\x01
(
\x02\x12\r\n\x05
top_p
\x18\x02
\x01
(
\x02\x12\r\n\x05
top_k
\x18\x03
\x01
(
\x05\x12\r\n\x05
min_p
\x18\x04
\x01
(
\x02\x12\x19\n\x11\x66
requency_penalty
\x18\x05
\x01
(
\x02\x12\x18\n\x10
presence_penalty
\x18\x06
\x01
(
\x02\x12\x1a\n\x12
repetition_penalty
\x18\x07
\x01
(
\x02\x12\x16\n\x0e
max_new_tokens
\x18\x08
\x01
(
\x05\x12\x0c\n\x04
stop
\x18\t
\x03
(
\t\x12\x16\n\x0e
stop_token_ids
\x18\n
\x03
(
\x05\x12\x1b\n\x13
skip_special_tokens
\x18\x0b
\x01
(
\x08\x12
%
\n\x1d
spaces_between_special_tokens
\x18\x0c
\x01
(
\x08\x12\x0f\n\x05
regex
\x18\r
\x01
(
\t
H
\x00\x12\x15\n\x0b
json_schema
\x18\x0e
\x01
(
\t
H
\x00\x12\x16\n\x0c\x65\x62
nf_grammar
\x18\x0f
\x01
(
\t
H
\x00\x12\x11\n\t
lora_path
\x18\x10
\x01
(
\t\x12\t\n\x01
n
\x18\x11
\x01
(
\x05\x12\x15\n\r
token_healing
\x18\x12
\x01
(
\x08\x12\x16\n\x0e
min_new_tokens
\x18\x13
\x01
(
\x05\x12\x12\n\n
ignore_eos
\x18\x14
\x01
(
\x08\x12\x14\n\x0c
no_stop_trim
\x18\x15
\x01
(
\x08\x12\x17\n\x0f
stream_interval
\x18\x16
\x01
(
\x05\x12
H
\n\n
logit_bias
\x18\x17
\x03
(
\x0b\x32\x34
.sglang.grpc.scheduler.SamplingParams.LogitBiasEntry
\x12\x16\n\x0e
structural_tag
\x18\x18
\x01
(
\t\x12
.
\n\r
custom_params
\x18\x19
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x1a\x30\n\x0e
LogitBiasEntry
\x12\x0b\n\x03
key
\x18\x01
\x01
(
\t\x12\r\n\x05
value
\x18\x02
\x01
(
\x02
:
\x02\x38\x01\x42\x0c\n\n
constraint
\"
]
\n\x13\x44
isaggregatedParams
\x12\x16\n\x0e\x62
ootstrap_host
\x18\x01
\x01
(
\t\x12\x16\n\x0e\x62
ootstrap_port
\x18\x02
\x01
(
\x05\x12\x16\n\x0e\x62
ootstrap_room
\x18\x03
\x01
(
\x05\"\xe9\x04\n\x0f
GenerateRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\t
tokenized
\x18\x02
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\x12
:
\n\t
mm_inputs
\x18\x03
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.MultimodalInputs
\x12
>
\n\x0f
sampling_params
\x18\x04
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.SamplingParams
\x12\x16\n\x0e
return_logprob
\x18\x05
\x01
(
\x08\x12\x19\n\x11
logprob_start_len
\x18\x06
\x01
(
\x05\x12\x18\n\x10
top_logprobs_num
\x18\x07
\x01
(
\x05\x12\x19\n\x11
token_ids_logprob
\x18\x08
\x03
(
\x05\x12\x1c\n\x14
return_hidden_states
\x18\t
\x01
(
\x08\x12
H
\n\x14\x64
isaggregated_params
\x18\n
\x01
(
\x0b\x32
*.sglang.grpc.scheduler.DisaggregatedParams
\x12\x1e\n\x16\x63
ustom_logit_processor
\x18\x0b
\x01
(
\t\x12
-
\n\t
timestamp
\x18\x0c
\x01
(
\x0b\x32\x1a
.google.protobuf.Timestamp
\x12\x13\n\x0b
log_metrics
\x18\r
\x01
(
\x08\x12\x14\n\x0c
input_embeds
\x18\x0e
\x03
(
\x02\x12\x0f\n\x07
lora_id
\x18\x0f
\x01
(
\t\x12\x1a\n\x12\x64\x61
ta_parallel_rank
\x18\x10
\x01
(
\x05\x12\x15\n\r
dp_balance_id
\x18\x11
\x01
(
\x05\"
:
\n\x0e
TokenizedInput
\x12\x15\n\r
original_text
\x18\x01
\x01
(
\t\x12\x11\n\t
input_ids
\x18\x02
\x03
(
\x05\"\xd3\x01\n\x10
MultimodalInputs
\x12\x12\n\n
image_urls
\x18\x01
\x03
(
\t\x12\x12\n\n
video_urls
\x18\x02
\x03
(
\t\x12\x12\n\n
audio_urls
\x18\x03
\x03
(
\t\x12\x33\n\x12
processed_features
\x18\x04
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\x12\x12\n\n
image_data
\x18\x05
\x03
(
\x0c\x12\x12\n\n
video_data
\x18\x06
\x03
(
\x0c\x12\x12\n\n
audio_data
\x18\x07
\x03
(
\x0c\x12\x12\n\n
modalities
\x18\x08
\x03
(
\t\"\xe3\x01\n\x10
GenerateResponse
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12
;
\n\x05\x63
hunk
\x18\x02
\x01
(
\x0b\x32
*.sglang.grpc.scheduler.GenerateStreamChunkH
\x00\x12
;
\n\x08\x63
omplete
\x18\x03
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.GenerateCompleteH
\x00\x12\x35\n\x05\x65
rror
\x18\x04
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.GenerateErrorH
\x00\x42\n\n\x08
response
\"\xf5\x01\n\x13
GenerateStreamChunk
\x12\x10\n\x08
token_id
\x18\x01
\x01
(
\x05\x12\x0c\n\x04
text
\x18\x02
\x01
(
\t\x12\x15\n\r
prompt_tokens
\x18\x03
\x01
(
\x05\x12\x19\n\x11\x63
ompletion_tokens
\x18\x04
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x05
\x01
(
\x05\x12\x31\n\x08
logprobs
\x18\x06
\x01
(
\x0b\x32\x1f
.sglang.grpc.scheduler.LogProbs
\x12\x15\n\r
hidden_states
\x18\x07
\x03
(
\x02\x12\x17\n\x0f
generation_time
\x18\x08
\x01
(
\x02\x12\x12\n\n
queue_time
\x18\t
\x01
(
\x05\"\xcd\x02\n\x10
GenerateComplete
\x12\x12\n\n
output_ids
\x18\x01
\x03
(
\x05\x12\x13\n\x0b
output_text
\x18\x02
\x01
(
\t\x12
K
\n\r
finish_reason
\x18\x03
\x01
(
\x0e\x32\x34
.sglang.grpc.scheduler.GenerateComplete.FinishReason
\x12\x35\n\x0c\x61
ll_logprobs
\x18\x0b
\x03
(
\x0b\x32\x1f
.sglang.grpc.scheduler.LogProbs
\x12
>
\n\x11\x61
ll_hidden_states
\x18\x0c
\x03
(
\x0b\x32
#.sglang.grpc.scheduler.HiddenStates
\"
L
\n\x0c\x46
inishReason
\x12\x08\n\x04
STOP
\x10\x00\x12\n\n\x06
LENGTH
\x10\x01\x12\r\n\t
EOS_TOKEN
\x10\x02\x12\x0c\n\x08
STOP_STR
\x10\x03\x12\t\n\x05\x41\x42
ORT
\x10\x04\"
K
\n\r
GenerateError
\x12\x0f\n\x07
message
\x18\x01
\x01
(
\t\x12\x18\n\x10
http_status_code
\x18\x02
\x01
(
\t\x12\x0f\n\x07\x64\x65
tails
\x18\x03
\x01
(
\t\"\x84\x01\n\x08
LogProbs
\x12\x16\n\x0e
token_logprobs
\x18\x01
\x03
(
\x02\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\x12\x38\n\x0c
top_logprobs
\x18\x03
\x03
(
\x0b\x32\"
.sglang.grpc.scheduler.TopLogProbs
\x12\x13\n\x0b
token_texts
\x18\x04
\x03
(
\t\"
E
\n\x0b
TopLogProbs
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\x11\n\t
token_ids
\x18\x02
\x03
(
\x05\x12\x13\n\x0b
token_texts
\x18\x03
\x03
(
\t\"
?
\n\x0c
HiddenStates
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\r\n\x05
layer
\x18\x02
\x01
(
\x05\x12\x10\n\x08
position
\x18\x03
\x01
(
\x05\"\xca\x02\n\x0c\x45
mbedRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\t
tokenized
\x18\x02
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\x12
:
\n\t
mm_inputs
\x18\x04
\x01
(
\x0b\x32\'
.sglang.grpc.scheduler.MultimodalInputs
\x12
>
\n\x0f
sampling_params
\x18\x05
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.SamplingParams
\x12\x13\n\x0b
log_metrics
\x18\x06
\x01
(
\x08\x12\x16\n\x0e
token_type_ids
\x18\x07
\x03
(
\x05\x12\x1a\n\x12\x64\x61
ta_parallel_rank
\x18\x08
\x01
(
\x05\x12\x18\n\x10
is_cross_encoder
\x18\t
\x01
(
\x08\x12\r\n\x05
texts
\x18\n
\x03
(
\t\"\x9d\x01\n\r
EmbedResponse
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x38\n\x08\x63
omplete
\x18\x02
\x01
(
\x0b\x32
$.sglang.grpc.scheduler.EmbedCompleteH
\x00\x12\x32\n\x05\x65
rror
\x18\x03
\x01
(
\x0b\x32
!.sglang.grpc.scheduler.EmbedErrorH
\x00\x42\n\n\x08
response
\"\xbc\x01\n\r
EmbedComplete
\x12\x11\n\t
embedding
\x18\x01
\x03
(
\x02\x12\x15\n\r
prompt_tokens
\x18\x02
\x01
(
\x05\x12\x15\n\r
cached_tokens
\x18\x03
\x01
(
\x05\x12\x15\n\r
embedding_dim
\x18\x04
\x01
(
\x05\x12\x17\n\x0f
generation_time
\x18\x05
\x01
(
\x02\x12
:
\n\x10\x62\x61
tch_embeddings
\x18\x06
\x03
(
\x0b\x32
.sglang.grpc.scheduler.Embedding
\"
*
\n\t
Embedding
\x12\x0e\n\x06
values
\x18\x01
\x03
(
\x02\x12\r\n\x05
index
\x18\x02
\x01
(
\x05\"
<
\n\n
EmbedError
\x12\x0f\n\x07
message
\x18\x01
\x01
(
\t\x12\x0c\n\x04\x63
ode
\x18\x02
\x01
(
\t\x12\x0f\n\x07\x64\x65
tails
\x18\x03
\x01
(
\t\"
N
\n\x12
HealthCheckRequest
\x12\x38\n\t
tokenized
\x18\x01
\x01
(
\x0b\x32
%.sglang.grpc.scheduler.TokenizedInput
\"
7
\n\x13
HealthCheckResponse
\x12\x0f\n\x07
healthy
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
2
\n\x0c\x41\x62
ortRequest
\x12\x12\n\n
request_id
\x18\x01
\x01
(
\t\x12\x0e\n\x06
reason
\x18\x02
\x01
(
\t\"
1
\n\r
AbortResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
I
\n\x0f
LoadLoRARequest
\x12\x12\n\n
adapter_id
\x18\x01
\x01
(
\t\x12\x14\n\x0c\x61\x64\x61
pter_path
\x18\x02
\x01
(
\t\x12\x0c\n\x04
rank
\x18\x03
\x01
(
\x05\"
H
\n\x10
LoadLoRAResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x12\n\n
adapter_id
\x18\x02
\x01
(
\t\x12\x0f\n\x07
message
\x18\x03
\x01
(
\t\"\'\n\x11
UnloadLoRARequest
\x12\x12\n\n
adapter_id
\x18\x01
\x01
(
\t\"
6
\n\x12
UnloadLoRAResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
w
\n\x14
UpdateWeightsRequest
\x12\x13\n\t
disk_path
\x18\x01
\x01
(
\t
H
\x00\x12\x15\n\x0b
tensor_data
\x18\x02
\x01
(
\x0c
H
\x00\x12\x14\n\n
remote_url
\x18\x03
\x01
(
\t
H
\x00\x12\x13\n\x0b
weight_name
\x18\x04
\x01
(
\t
B
\x08\n\x06
source
\"
9
\n\x15
UpdateWeightsResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t\"
-
\n\x17
GetInternalStateRequest
\x12\x12\n\n
state_keys
\x18\x01
\x03
(
\t\"
B
\n\x18
GetInternalStateResponse
\x12
&
\n\x05
state
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\"
A
\n\x17
SetInternalStateRequest
\x12
&
\n\x05
state
\x18\x01
\x01
(
\x0b\x32\x17
.google.protobuf.Struct
\"
<
\n\x18
SetInternalStateResponse
\x12\x0f\n\x07
success
\x18\x01
\x01
(
\x08\x12\x0f\n\x07
message
\x18\x02
\x01
(
\t
2
\xfe\x02\n\x0f
SglangScheduler
\x12
]
\n\x08
Generate
\x12
&.sglang.grpc.scheduler.GenerateRequest
\x1a\'
.sglang.grpc.scheduler.GenerateResponse0
\x01\x12
R
\n\x05\x45
mbed
\x12
#.sglang.grpc.scheduler.EmbedRequest
\x1a
$.sglang.grpc.scheduler.EmbedResponse
\x12\x64\n\x0b
HealthCheck
\x12
).sglang.grpc.scheduler.HealthCheckRequest
\x1a
*.sglang.grpc.scheduler.HealthCheckResponse
\x12
R
\n\x05\x41\x62
ort
\x12
#.sglang.grpc.scheduler.AbortRequest
\x1a
$.sglang.grpc.scheduler.AbortResponseb
\x06
proto3'
)
_globals
=
globals
()
_builder
.
BuildMessageAndEnumDescriptors
(
DESCRIPTOR
,
_globals
)
_builder
.
BuildTopDescriptorsAndMessages
(
DESCRIPTOR
,
'sglang_scheduler_pb2'
,
_globals
)
if
not
_descriptor
.
_USE_C_DESCRIPTORS
:
DESCRIPTOR
.
_loaded_options
=
None
_globals
[
'_SAMPLINGPARAMS_LOGITBIASENTRY'
].
_loaded_options
=
None
_globals
[
'_SAMPLINGPARAMS_LOGITBIASENTRY'
].
_serialized_options
=
b
'8
\001
'
_globals
[
'_SAMPLINGPARAMS'
].
_serialized_start
=
113
_globals
[
'_SAMPLINGPARAMS'
].
_serialized_end
=
824
_globals
[
'_SAMPLINGPARAMS_LOGITBIASENTRY'
].
_serialized_start
=
762
_globals
[
'_SAMPLINGPARAMS_LOGITBIASENTRY'
].
_serialized_end
=
810
_globals
[
'_DISAGGREGATEDPARAMS'
].
_serialized_start
=
826
_globals
[
'_DISAGGREGATEDPARAMS'
].
_serialized_end
=
919
_globals
[
'_GENERATEREQUEST'
].
_serialized_start
=
922
_globals
[
'_GENERATEREQUEST'
].
_serialized_end
=
1539
_globals
[
'_TOKENIZEDINPUT'
].
_serialized_start
=
1541
_globals
[
'_TOKENIZEDINPUT'
].
_serialized_end
=
1599
_globals
[
'_MULTIMODALINPUTS'
].
_serialized_start
=
1602
_globals
[
'_MULTIMODALINPUTS'
].
_serialized_end
=
1813
_globals
[
'_GENERATERESPONSE'
].
_serialized_start
=
1816
_globals
[
'_GENERATERESPONSE'
].
_serialized_end
=
2043
_globals
[
'_GENERATESTREAMCHUNK'
].
_serialized_start
=
2046
_globals
[
'_GENERATESTREAMCHUNK'
].
_serialized_end
=
2291
_globals
[
'_GENERATECOMPLETE'
].
_serialized_start
=
2294
_globals
[
'_GENERATECOMPLETE'
].
_serialized_end
=
2627
_globals
[
'_GENERATECOMPLETE_FINISHREASON'
].
_serialized_start
=
2551
_globals
[
'_GENERATECOMPLETE_FINISHREASON'
].
_serialized_end
=
2627
_globals
[
'_GENERATEERROR'
].
_serialized_start
=
2629
_globals
[
'_GENERATEERROR'
].
_serialized_end
=
2704
_globals
[
'_LOGPROBS'
].
_serialized_start
=
2707
_globals
[
'_LOGPROBS'
].
_serialized_end
=
2839
_globals
[
'_TOPLOGPROBS'
].
_serialized_start
=
2841
_globals
[
'_TOPLOGPROBS'
].
_serialized_end
=
2910
_globals
[
'_HIDDENSTATES'
].
_serialized_start
=
2912
_globals
[
'_HIDDENSTATES'
].
_serialized_end
=
2975
_globals
[
'_EMBEDREQUEST'
].
_serialized_start
=
2978
_globals
[
'_EMBEDREQUEST'
].
_serialized_end
=
3308
_globals
[
'_EMBEDRESPONSE'
].
_serialized_start
=
3311
_globals
[
'_EMBEDRESPONSE'
].
_serialized_end
=
3468
_globals
[
'_EMBEDCOMPLETE'
].
_serialized_start
=
3471
_globals
[
'_EMBEDCOMPLETE'
].
_serialized_end
=
3659
_globals
[
'_EMBEDDING'
].
_serialized_start
=
3661
_globals
[
'_EMBEDDING'
].
_serialized_end
=
3703
_globals
[
'_EMBEDERROR'
].
_serialized_start
=
3705
_globals
[
'_EMBEDERROR'
].
_serialized_end
=
3765
_globals
[
'_HEALTHCHECKREQUEST'
].
_serialized_start
=
3767
_globals
[
'_HEALTHCHECKREQUEST'
].
_serialized_end
=
3845
_globals
[
'_HEALTHCHECKRESPONSE'
].
_serialized_start
=
3847
_globals
[
'_HEALTHCHECKRESPONSE'
].
_serialized_end
=
3902
_globals
[
'_ABORTREQUEST'
].
_serialized_start
=
3904
_globals
[
'_ABORTREQUEST'
].
_serialized_end
=
3954
_globals
[
'_ABORTRESPONSE'
].
_serialized_start
=
3956
_globals
[
'_ABORTRESPONSE'
].
_serialized_end
=
4005
_globals
[
'_LOADLORAREQUEST'
].
_serialized_start
=
4007
_globals
[
'_LOADLORAREQUEST'
].
_serialized_end
=
4080
_globals
[
'_LOADLORARESPONSE'
].
_serialized_start
=
4082
_globals
[
'_LOADLORARESPONSE'
].
_serialized_end
=
4154
_globals
[
'_UNLOADLORAREQUEST'
].
_serialized_start
=
4156
_globals
[
'_UNLOADLORAREQUEST'
].
_serialized_end
=
4195
_globals
[
'_UNLOADLORARESPONSE'
].
_serialized_start
=
4197
_globals
[
'_UNLOADLORARESPONSE'
].
_serialized_end
=
4251
_globals
[
'_UPDATEWEIGHTSREQUEST'
].
_serialized_start
=
4253
_globals
[
'_UPDATEWEIGHTSREQUEST'
].
_serialized_end
=
4372
_globals
[
'_UPDATEWEIGHTSRESPONSE'
].
_serialized_start
=
4374
_globals
[
'_UPDATEWEIGHTSRESPONSE'
].
_serialized_end
=
4431
_globals
[
'_GETINTERNALSTATEREQUEST'
].
_serialized_start
=
4433
_globals
[
'_GETINTERNALSTATEREQUEST'
].
_serialized_end
=
4478
_globals
[
'_GETINTERNALSTATERESPONSE'
].
_serialized_start
=
4480
_globals
[
'_GETINTERNALSTATERESPONSE'
].
_serialized_end
=
4546
_globals
[
'_SETINTERNALSTATEREQUEST'
].
_serialized_start
=
4548
_globals
[
'_SETINTERNALSTATEREQUEST'
].
_serialized_end
=
4613
_globals
[
'_SETINTERNALSTATERESPONSE'
].
_serialized_start
=
4615
_globals
[
'_SETINTERNALSTATERESPONSE'
].
_serialized_end
=
4675
_globals
[
'_SGLANGSCHEDULER'
].
_serialized_start
=
4678
_globals
[
'_SGLANGSCHEDULER'
].
_serialized_end
=
5060
# @@protoc_insertion_point(module_scope)
python/sglang/srt/grpc/sglang_scheduler_pb2.pyi
0 → 100644
View file @
53ca1552
This diff is collapsed.
Click to expand it.
python/sglang/srt/grpc/sglang_scheduler_pb2_grpc.py
0 → 100644
View file @
53ca1552
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import
grpc
import
warnings
from
.
import
sglang_scheduler_pb2
as
sglang__scheduler__pb2
GRPC_GENERATED_VERSION
=
'1.74.0'
GRPC_VERSION
=
grpc
.
__version__
_version_not_supported
=
False
try
:
from
grpc._utilities
import
first_version_is_lower
_version_not_supported
=
first_version_is_lower
(
GRPC_VERSION
,
GRPC_GENERATED_VERSION
)
except
ImportError
:
_version_not_supported
=
True
if
_version_not_supported
:
raise
RuntimeError
(
f
'The grpc package installed is at version
{
GRPC_VERSION
}
,'
+
f
' but the generated code in sglang_scheduler_pb2_grpc.py depends on'
+
f
' grpcio>=
{
GRPC_GENERATED_VERSION
}
.'
+
f
' Please upgrade your grpc module to grpcio>=
{
GRPC_GENERATED_VERSION
}
'
+
f
' or downgrade your generated code using grpcio-tools<=
{
GRPC_VERSION
}
.'
)
class
SglangSchedulerStub
(
object
):
"""Service definition for SGLang scheduler communication
This protocol bridges the Rust router and Python scheduler
"""
def
__init__
(
self
,
channel
):
"""Constructor.
Args:
channel: A grpc.Channel.
"""
self
.
Generate
=
channel
.
unary_stream
(
'/sglang.grpc.scheduler.SglangScheduler/Generate'
,
request_serializer
=
sglang__scheduler__pb2
.
GenerateRequest
.
SerializeToString
,
response_deserializer
=
sglang__scheduler__pb2
.
GenerateResponse
.
FromString
,
_registered_method
=
True
)
self
.
Embed
=
channel
.
unary_unary
(
'/sglang.grpc.scheduler.SglangScheduler/Embed'
,
request_serializer
=
sglang__scheduler__pb2
.
EmbedRequest
.
SerializeToString
,
response_deserializer
=
sglang__scheduler__pb2
.
EmbedResponse
.
FromString
,
_registered_method
=
True
)
self
.
HealthCheck
=
channel
.
unary_unary
(
'/sglang.grpc.scheduler.SglangScheduler/HealthCheck'
,
request_serializer
=
sglang__scheduler__pb2
.
HealthCheckRequest
.
SerializeToString
,
response_deserializer
=
sglang__scheduler__pb2
.
HealthCheckResponse
.
FromString
,
_registered_method
=
True
)
self
.
Abort
=
channel
.
unary_unary
(
'/sglang.grpc.scheduler.SglangScheduler/Abort'
,
request_serializer
=
sglang__scheduler__pb2
.
AbortRequest
.
SerializeToString
,
response_deserializer
=
sglang__scheduler__pb2
.
AbortResponse
.
FromString
,
_registered_method
=
True
)
class
SglangSchedulerServicer
(
object
):
"""Service definition for SGLang scheduler communication
This protocol bridges the Rust router and Python scheduler
"""
def
Generate
(
self
,
request
,
context
):
"""Submit a generation request (supports streaming)
"""
context
.
set_code
(
grpc
.
StatusCode
.
UNIMPLEMENTED
)
context
.
set_details
(
'Method not implemented!'
)
raise
NotImplementedError
(
'Method not implemented!'
)
def
Embed
(
self
,
request
,
context
):
"""Submit an embedding request
"""
context
.
set_code
(
grpc
.
StatusCode
.
UNIMPLEMENTED
)
context
.
set_details
(
'Method not implemented!'
)
raise
NotImplementedError
(
'Method not implemented!'
)
def
HealthCheck
(
self
,
request
,
context
):
"""Health check and metrics
"""
context
.
set_code
(
grpc
.
StatusCode
.
UNIMPLEMENTED
)
context
.
set_details
(
'Method not implemented!'
)
raise
NotImplementedError
(
'Method not implemented!'
)
def
Abort
(
self
,
request
,
context
):
"""Abort a running request
"""
context
.
set_code
(
grpc
.
StatusCode
.
UNIMPLEMENTED
)
context
.
set_details
(
'Method not implemented!'
)
raise
NotImplementedError
(
'Method not implemented!'
)
def
add_SglangSchedulerServicer_to_server
(
servicer
,
server
):
rpc_method_handlers
=
{
'Generate'
:
grpc
.
unary_stream_rpc_method_handler
(
servicer
.
Generate
,
request_deserializer
=
sglang__scheduler__pb2
.
GenerateRequest
.
FromString
,
response_serializer
=
sglang__scheduler__pb2
.
GenerateResponse
.
SerializeToString
,
),
'Embed'
:
grpc
.
unary_unary_rpc_method_handler
(
servicer
.
Embed
,
request_deserializer
=
sglang__scheduler__pb2
.
EmbedRequest
.
FromString
,
response_serializer
=
sglang__scheduler__pb2
.
EmbedResponse
.
SerializeToString
,
),
'HealthCheck'
:
grpc
.
unary_unary_rpc_method_handler
(
servicer
.
HealthCheck
,
request_deserializer
=
sglang__scheduler__pb2
.
HealthCheckRequest
.
FromString
,
response_serializer
=
sglang__scheduler__pb2
.
HealthCheckResponse
.
SerializeToString
,
),
'Abort'
:
grpc
.
unary_unary_rpc_method_handler
(
servicer
.
Abort
,
request_deserializer
=
sglang__scheduler__pb2
.
AbortRequest
.
FromString
,
response_serializer
=
sglang__scheduler__pb2
.
AbortResponse
.
SerializeToString
,
),
}
generic_handler
=
grpc
.
method_handlers_generic_handler
(
'sglang.grpc.scheduler.SglangScheduler'
,
rpc_method_handlers
)
server
.
add_generic_rpc_handlers
((
generic_handler
,))
server
.
add_registered_method_handlers
(
'sglang.grpc.scheduler.SglangScheduler'
,
rpc_method_handlers
)
# This class is part of an EXPERIMENTAL API.
class SglangScheduler(object):
    """Service definition for SGLang scheduler communication
    This protocol bridges the Rust router and Python scheduler
    """

    # NOTE: This is grpcio-tools-generated code. Each static method is a
    # convenience one-shot invocation of the corresponding RPC on the
    # 'sglang.grpc.scheduler.SglangScheduler' service, delegating to the
    # experimental grpc API. Positional argument order matters and must match
    # grpc.experimental.unary_stream / unary_unary exactly:
    # (request, target, method_path, request_serializer, response_deserializer,
    #  options, channel_credentials, insecure, call_credentials, compression,
    #  wait_for_ready, timeout, metadata).

    @staticmethod
    def Generate(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        # Server-streaming RPC: one GenerateRequest in, a stream of
        # GenerateResponse messages out (hence unary_stream).
        return grpc.experimental.unary_stream(
            request,
            target,
            '/sglang.grpc.scheduler.SglangScheduler/Generate',
            sglang__scheduler__pb2.GenerateRequest.SerializeToString,
            sglang__scheduler__pb2.GenerateResponse.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)

    @staticmethod
    def Embed(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        # Unary RPC: single EmbedRequest in, single EmbedResponse out.
        return grpc.experimental.unary_unary(
            request,
            target,
            '/sglang.grpc.scheduler.SglangScheduler/Embed',
            sglang__scheduler__pb2.EmbedRequest.SerializeToString,
            sglang__scheduler__pb2.EmbedResponse.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)

    @staticmethod
    def HealthCheck(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        # Unary RPC: single HealthCheckRequest in, single HealthCheckResponse out.
        return grpc.experimental.unary_unary(
            request,
            target,
            '/sglang.grpc.scheduler.SglangScheduler/HealthCheck',
            sglang__scheduler__pb2.HealthCheckRequest.SerializeToString,
            sglang__scheduler__pb2.HealthCheckResponse.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)

    @staticmethod
    def Abort(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        # Unary RPC: single AbortRequest in, single AbortResponse out.
        return grpc.experimental.unary_unary(
            request,
            target,
            '/sglang.grpc.scheduler.SglangScheduler/Abort',
            sglang__scheduler__pb2.AbortRequest.SerializeToString,
            sglang__scheduler__pb2.AbortResponse.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)
python/sglang/srt/server_args.py
View file @
53ca1552
...
...
@@ -2238,6 +2238,7 @@ class ServerArgs:
args
.
pp_size
=
args
.
pipeline_parallel_size
args
.
dp_size
=
args
.
data_parallel_size
args
.
ep_size
=
args
.
expert_parallel_size
attrs
=
[
attr
.
name
for
attr
in
dataclasses
.
fields
(
cls
)]
return
cls
(
**
{
attr
:
getattr
(
args
,
attr
)
for
attr
in
attrs
})
...
...
sgl-router/src/grpc/client.rs
View file @
53ca1552
...
...
@@ -37,21 +37,6 @@ impl SglangSchedulerClient {
Ok
(
Self
{
client
})
}
/// Initialize the connection
pub
async
fn
initialize
(
&
mut
self
,
client_id
:
String
,
)
->
Result
<
proto
::
InitializeResponse
,
Box
<
dyn
std
::
error
::
Error
>>
{
let
request
=
Request
::
new
(
proto
::
InitializeRequest
{
client_id
,
client_version
:
"0.1.0"
.to_string
(),
mode
:
proto
::
initialize_request
::
Mode
::
Regular
as
i32
,
});
let
response
=
self
.client
.initialize
(
request
)
.await
?
;
Ok
(
response
.into_inner
())
}
/// Submit a generation request (returns streaming response)
pub
async
fn
generate_stream
(
&
mut
self
,
...
...
@@ -68,7 +53,10 @@ impl SglangSchedulerClient {
)
->
Result
<
proto
::
HealthCheckResponse
,
Box
<
dyn
std
::
error
::
Error
>>
{
debug!
(
"Sending health check request"
);
let
request
=
Request
::
new
(
proto
::
HealthCheckRequest
{
include_detailed_metrics
:
false
,
tokenized
:
Some
(
proto
::
TokenizedInput
{
original_text
:
"Hello"
.to_string
(),
input_ids
:
vec!
[
9906
],
// Mock token ID for "Hello"
}),
});
let
response
=
self
.client
.health_check
(
request
)
.await
?
;
...
...
@@ -87,21 +75,6 @@ impl SglangSchedulerClient {
self
.client
.abort
(
request
)
.await
?
;
Ok
(())
}
/// Flush cache
pub
async
fn
flush_cache
(
&
mut
self
,
flush_all
:
bool
,
session_ids
:
&
[
String
],
)
->
Result
<
proto
::
FlushCacheResponse
,
Box
<
dyn
std
::
error
::
Error
>>
{
let
request
=
Request
::
new
(
proto
::
FlushCacheRequest
{
flush_all
,
session_ids
:
session_ids
.to_vec
(),
});
let
response
=
self
.client
.flush_cache
(
request
)
.await
?
;
Ok
(
response
.into_inner
())
}
}
#[cfg(test)]
...
...
@@ -111,14 +84,13 @@ mod tests {
#[test]
fn
test_proto_types_compilation
()
{
// Test that protobuf types can be constructed
let
init_req
=
proto
::
InitializeRequest
{
client_id
:
"test-client"
.to_string
(),
client_version
:
"0.1.0"
.to_string
(),
mode
:
0
,
let
health_req
=
proto
::
HealthCheckRequest
{
tokenized
:
Some
(
proto
::
TokenizedInput
{
original_text
:
"test"
.to_string
(),
input_ids
:
vec!
[
1296
],
}),
};
assert_eq!
(
init_req
.client_id
,
"test-client"
);
assert_eq!
(
init_req
.client_version
,
"0.1.0"
);
assert_eq!
(
init_req
.mode
,
0
);
assert
!
(
health_req
.tokenized
.is_some
());
}
#[test]
...
...
@@ -134,9 +106,10 @@ mod tests {
let
gen_req
=
proto
::
GenerateRequest
{
request_id
:
"test-req-123"
.to_string
(),
input
:
Some
(
proto
::
generate_request
::
Input
::
Text
(
"Hello world"
.to_string
(),
)),
tokenized
:
Some
(
proto
::
TokenizedInput
{
original_text
:
"Hello world"
.to_string
(),
input_ids
:
vec!
[
9906
,
1917
],
// Mock token IDs for "Hello world"
}),
sampling_params
:
Some
(
sampling_params
),
return_logprob
:
true
,
logprob_start_len
:
0
,
...
...
@@ -145,8 +118,8 @@ mod tests {
};
assert_eq!
(
gen_req
.request_id
,
"test-req-123"
);
if
let
Some
(
proto
::
generate_request
::
Input
::
Text
(
text
)
)
=
&
gen_req
.
input
{
assert_eq!
(
text
,
"Hello world"
);
if
let
Some
(
ref
tokenized
)
=
&
gen_req
.
tokenized
{
assert_eq!
(
tokenized
.original_
text
,
"Hello world"
);
}
assert
!
(
gen_req
.return_logprob
);
assert_eq!
(
gen_req
.top_logprobs_num
,
5
);
...
...
@@ -160,9 +133,12 @@ mod tests {
#[test]
fn
test_health_check_request
()
{
let
health_req
=
proto
::
HealthCheckRequest
{
include_detailed_metrics
:
true
,
tokenized
:
Some
(
proto
::
TokenizedInput
{
original_text
:
"test"
.to_string
(),
input_ids
:
vec!
[
1296
],
// Mock token ID for "test"
}),
};
assert
!
(
health_req
.
include_detailed_metrics
);
assert
!
(
health_req
.
tokenized
.is_some
()
);
}
#[test]
...
...
@@ -175,17 +151,6 @@ mod tests {
assert_eq!
(
abort_req
.reason
,
"User canceled"
);
}
#[test]
fn
test_flush_cache_request
()
{
let
flush_req
=
proto
::
FlushCacheRequest
{
flush_all
:
true
,
session_ids
:
vec!
[
"session1"
.to_string
(),
"session2"
.to_string
()],
};
assert
!
(
flush_req
.flush_all
);
assert_eq!
(
flush_req
.session_ids
.len
(),
2
);
assert_eq!
(
flush_req
.session_ids
[
0
],
"session1"
);
}
#[test]
fn
test_sampling_params_defaults
()
{
let
params
=
proto
::
SamplingParams
::
default
();
...
...
@@ -214,38 +179,29 @@ mod tests {
assert_eq!
(
mm_inputs
.modalities
[
0
],
"image"
);
}
#[test]
fn
test_session_params
()
{
let
session_params
=
proto
::
SessionParams
{
session_id
:
"sess-789"
.to_string
(),
request_id
:
"req-101"
.to_string
(),
offset
:
100
,
replace
:
true
,
drop_previous_output
:
false
,
};
assert_eq!
(
session_params
.session_id
,
"sess-789"
);
assert_eq!
(
session_params
.request_id
,
"req-101"
);
assert_eq!
(
session_params
.offset
,
100
);
assert
!
(
session_params
.replace
);
assert
!
(
!
session_params
.drop_previous_output
);
}
// TODO: SessionParams not in current proto - skip test
// #[test]
// fn test_session_params() { ... }
#[test]
fn
test_embed_request
()
{
let
embed_req
=
proto
::
EmbedRequest
{
request_id
:
"embed-req-202"
.to_string
(),
input
:
Some
(
proto
::
embed_request
::
Input
::
Text
(
"This is a test sentence for embedding"
.to_string
(),
)),
tokenized
:
Some
(
proto
::
TokenizedInput
{
original_text
:
"This is a test sentence for embedding"
.to_string
(),
input_ids
:
vec!
[
2028
,
374
,
264
,
1296
,
11914
,
369
,
28537
],
// Mock token IDs
}),
log_metrics
:
true
,
data_parallel_rank
:
0
,
..
Default
::
default
()
};
assert_eq!
(
embed_req
.request_id
,
"embed-req-202"
);
if
let
Some
(
proto
::
embed_request
::
Input
::
Text
(
text
))
=
&
embed_req
.input
{
assert_eq!
(
text
,
"This is a test sentence for embedding"
);
if
let
Some
(
ref
tokenized
)
=
&
embed_req
.tokenized
{
assert_eq!
(
tokenized
.original_text
,
"This is a test sentence for embedding"
);
}
assert
!
(
embed_req
.log_metrics
);
assert_eq!
(
embed_req
.data_parallel_rank
,
0
);
...
...
@@ -292,36 +248,7 @@ mod tests {
assert_eq!
(
chunk
.queue_time
,
10
);
}
#[test]
fn
test_model_info
()
{
let
model_info
=
proto
::
ModelInfo
{
model_name
:
"Meta-Llama-3-8B-Instruct"
.to_string
(),
max_context_length
:
8192
,
vocab_size
:
128256
,
supports_tool_calling
:
true
,
supports_vision
:
false
,
special_tokens
:
vec!
[
"<|begin_of_text|>"
.to_string
(),
"<|end_of_text|>"
.to_string
(),
],
model_type
:
"llama"
.to_string
(),
num_layers
:
32
,
hidden_size
:
4096
,
num_attention_heads
:
32
,
num_key_value_heads
:
8
,
tokenizer_type
:
"llama"
.to_string
(),
eos_token_ids
:
vec!
[
128001
,
128009
],
pad_token_id
:
128001
,
bos_token_id
:
128000
,
};
assert_eq!
(
model_info
.model_name
,
"Meta-Llama-3-8B-Instruct"
);
assert_eq!
(
model_info
.max_context_length
,
8192
);
assert_eq!
(
model_info
.vocab_size
,
128256
);
assert
!
(
model_info
.supports_tool_calling
);
assert
!
(
!
model_info
.supports_vision
);
assert_eq!
(
model_info
.special_tokens
.len
(),
2
);
assert_eq!
(
model_info
.num_layers
,
32
);
assert_eq!
(
model_info
.eos_token_ids
,
vec!
[
128001
,
128009
]);
}
// TODO: ModelInfo not in current proto - skip test
// #[test]
// fn test_model_info() { ... }
}
sgl-router/src/proto/sglang_scheduler.proto
View file @
53ca1552
...
...
@@ -8,9 +8,6 @@ import "google/protobuf/struct.proto";
// Service definition for SGLang scheduler communication
// This protocol bridges the Rust router and Python scheduler
service
SglangScheduler
{
// Initialize connection and get model info
rpc
Initialize
(
InitializeRequest
)
returns
(
InitializeResponse
);
// Submit a generation request (supports streaming)
rpc
Generate
(
GenerateRequest
)
returns
(
stream
GenerateResponse
);
...
...
@@ -23,8 +20,6 @@ service SglangScheduler {
// Abort a running request
rpc
Abort
(
AbortRequest
)
returns
(
AbortResponse
);
// Flush KV cache
rpc
FlushCache
(
FlushCacheRequest
)
returns
(
FlushCacheResponse
);
}
// =====================
...
...
@@ -75,14 +70,6 @@ message SamplingParams {
google.protobuf.Struct
custom_params
=
25
;
}
// Session parameters for continual prompting
message
SessionParams
{
string
session_id
=
1
;
string
request_id
=
2
;
int32
offset
=
3
;
bool
replace
=
4
;
bool
drop_previous_output
=
5
;
}
// Disaggregated serving parameters
message
DisaggregatedParams
{
...
...
@@ -91,87 +78,6 @@ message DisaggregatedParams {
int32
bootstrap_room
=
3
;
}
// =====================
// Initialize
// =====================
message
InitializeRequest
{
string
client_id
=
1
;
string
client_version
=
2
;
// Operating mode
enum
Mode
{
REGULAR
=
0
;
// Normal mode with local scheduler
PREFILL
=
1
;
// Prefill-only mode for disaggregated serving
DECODE
=
2
;
// Decode-only mode for disaggregated serving
}
Mode
mode
=
3
;
}
message
InitializeResponse
{
bool
success
=
1
;
string
scheduler_version
=
2
;
// Model information
ModelInfo
model_info
=
3
;
// Server capabilities
ServerCapabilities
capabilities
=
4
;
// Error message if success is false
string
error_message
=
5
;
}
message
ModelInfo
{
string
model_name
=
1
;
int32
max_context_length
=
2
;
int32
vocab_size
=
3
;
bool
supports_tool_calling
=
4
;
bool
supports_vision
=
5
;
repeated
string
special_tokens
=
6
;
// Additional model metadata
string
model_type
=
7
;
int32
num_layers
=
8
;
int32
hidden_size
=
9
;
int32
num_attention_heads
=
10
;
int32
num_key_value_heads
=
11
;
// Tokenizer info
string
tokenizer_type
=
12
;
repeated
int32
eos_token_ids
=
13
;
int32
pad_token_id
=
14
;
int32
bos_token_id
=
15
;
}
message
ServerCapabilities
{
bool
continuous_batching
=
1
;
bool
disaggregated_serving
=
2
;
bool
speculative_decoding
=
3
;
int32
max_batch_size
=
4
;
int32
max_num_batched_tokens
=
5
;
int32
max_prefill_tokens
=
6
;
string
attention_backend
=
7
;
// "flashinfer", "triton", "torch"
// Additional capabilities
bool
supports_lora
=
8
;
bool
supports_grammar
=
9
;
bool
supports_multimodal
=
10
;
repeated
string
supported_modalities
=
11
;
// ["image", "video", "audio"]
bool
supports_custom_logit_processor
=
12
;
bool
supports_session
=
13
;
// Hardware info
int32
num_gpus
=
14
;
string
gpu_type
=
15
;
int64
total_gpu_memory
=
16
;
// Parallelism info
int32
tensor_parallel_size
=
17
;
int32
pipeline_parallel_size
=
18
;
int32
data_parallel_size
=
19
;
}
// =====================
// Generate Request
// =====================
...
...
@@ -179,49 +85,43 @@ message ServerCapabilities {
message
GenerateRequest
{
string
request_id
=
1
;
// Input can be either text or tokenized
oneof
input
{
string
text
=
2
;
TokenizedInput
tokenized
=
3
;
}
// Input must be tokenized (no raw text)
TokenizedInput
tokenized
=
2
;
// Multimodal inputs
MultimodalInputs
mm_inputs
=
4
;
MultimodalInputs
mm_inputs
=
3
;
// Generation parameters
SamplingParams
sampling_params
=
5
;
SamplingParams
sampling_params
=
4
;
// Return options
bool
return_logprob
=
6
;
int32
logprob_start_len
=
7
;
int32
top_logprobs_num
=
8
;
repeated
int32
token_ids_logprob
=
9
;
bool
return_hidden_states
=
10
;
// Session management
SessionParams
session_params
=
11
;
bool
return_logprob
=
5
;
int32
logprob_start_len
=
6
;
int32
top_logprobs_num
=
7
;
repeated
int32
token_ids_logprob
=
8
;
bool
return_hidden_states
=
9
;
// For disaggregated serving
DisaggregatedParams
disaggregated_params
=
1
2
;
DisaggregatedParams
disaggregated_params
=
1
0
;
// Custom logit processor (serialized)
string
custom_logit_processor
=
1
3
;
string
custom_logit_processor
=
1
1
;
// Request metadata
google.protobuf.Timestamp
timestamp
=
1
4
;
bool
log_metrics
=
1
5
;
google.protobuf.Timestamp
timestamp
=
1
2
;
bool
log_metrics
=
1
3
;
// Input embeddings (alternative to text/tokens)
repeated
float
input_embeds
=
1
6
;
repeated
float
input_embeds
=
1
4
;
// LoRA adapter ID (if pre-loaded)
string
lora_id
=
1
7
;
string
lora_id
=
1
5
;
// Data parallel routing
int32
data_parallel_rank
=
1
8
;
int32
data_parallel_rank
=
1
6
;
// For load balancing
int32
dp_balance_id
=
1
9
;
int32
dp_balance_id
=
1
7
;
}
message
TokenizedInput
{
...
...
@@ -303,19 +203,6 @@ message GenerateComplete {
}
FinishReason
finish_reason
=
3
;
// Final counts
int32
prompt_tokens
=
4
;
int32
completion_tokens
=
5
;
int32
cached_tokens
=
6
;
// Performance metrics
float
total_generation_time
=
7
;
float
time_to_first_token
=
8
;
float
tokens_per_second
=
9
;
// Spec decode metrics
int32
spec_verify_count
=
10
;
// All logprobs if requested
repeated
LogProbs
all_logprobs
=
11
;
...
...
@@ -359,10 +246,8 @@ message HiddenStates {
message
EmbedRequest
{
string
request_id
=
1
;
oneof
input
{
string
text
=
2
;
TokenizedInput
tokenized
=
3
;
}
// Input must be tokenized (no raw text)
TokenizedInput
tokenized
=
2
;
// Multimodal inputs
MultimodalInputs
mm_inputs
=
4
;
...
...
@@ -422,39 +307,13 @@ message EmbedError {
// =====================
message
HealthCheckRequest
{
bool
include_detailed_metrics
=
1
;
// Input for health test generation (must be tokenized)
TokenizedInput
tokenized
=
1
;
}
message
HealthCheckResponse
{
bool
healthy
=
1
;
// Current load metrics
int32
num_requests_running
=
2
;
int32
num_requests_waiting
=
3
;
float
gpu_cache_usage
=
4
;
float
gpu_memory_usage
=
5
;
// KV cache metrics
int32
kv_cache_total_blocks
=
6
;
int32
kv_cache_used_blocks
=
7
;
float
kv_cache_hit_rate
=
8
;
// Additional metrics
int32
num_grammar_queue_requests
=
9
;
float
generation_throughput
=
10
;
// tokens/sec
float
average_queue_time
=
11
;
// seconds
float
average_generation_time
=
12
;
// seconds
// System metrics
float
cpu_usage
=
13
;
int64
memory_usage
=
14
;
// Disaggregation metrics
int32
num_prefill_requests
=
15
;
int32
num_decode_requests
=
16
;
// Detailed metrics (optional)
google.protobuf.Struct
detailed_metrics
=
17
;
string
message
=
2
;
}
message
AbortRequest
{
...
...
@@ -467,17 +326,6 @@ message AbortResponse {
string
message
=
2
;
}
message
FlushCacheRequest
{
bool
flush_all
=
1
;
repeated
string
session_ids
=
2
;
// Flush specific sessions
}
message
FlushCacheResponse
{
bool
success
=
1
;
int32
num_entries_flushed
=
2
;
int64
memory_freed
=
3
;
// bytes
string
message
=
4
;
}
// =====================
// Additional Operations (Future)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment