".github/vscode:/vscode.git/clone" did not exist on "7009080014c65aeef4616d5fd9c2459c916e2e7a"
Unverified Commit 53ca1552 authored by Chang Su's avatar Chang Su Committed by GitHub
Browse files

Implement Standalone gRPC Server for SGLang Python Scheduler (#10283)

parent a23bdeaf
...@@ -22,17 +22,19 @@ repos: ...@@ -22,17 +22,19 @@ repos:
rev: 5.13.2 rev: 5.13.2
hooks: hooks:
- id: isort - id: isort
exclude: '^python/sglang/srt/grpc/.*_pb2\.py$|^python/sglang/srt/grpc/.*_pb2_grpc\.py$|^python/sglang/srt/grpc/.*_pb2\.pyi$|^python/sglang/srt/grpc/.*_pb2_grpc\.pyi$'
- repo: https://github.com/astral-sh/ruff-pre-commit - repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.7 rev: v0.11.7
hooks: hooks:
- id: ruff - id: ruff
args: [--select=F401, --fixable=F401] args: [--select=F401, --fixable=F401]
files: ^(benchmark/|docs/|examples/) files: ^(benchmark/|docs/|examples/)
exclude: \.ipynb$ exclude: \.ipynb$|^python/sglang/srt/grpc/.*_pb2\.py$|^python/sglang/srt/grpc/.*_pb2_grpc\.py$|^python/sglang/srt/grpc/.*_pb2\.pyi$|^python/sglang/srt/grpc/.*_pb2_grpc\.pyi$
- repo: https://github.com/psf/black - repo: https://github.com/psf/black
rev: 24.10.0 rev: 24.10.0
hooks: hooks:
- id: black-jupyter - id: black-jupyter
exclude: '^python/sglang/srt/grpc/.*_pb2\.py$|^python/sglang/srt/grpc/.*_pb2_grpc\.py$|^python/sglang/srt/grpc/.*_pb2\.pyi$|^python/sglang/srt/grpc/.*_pb2_grpc\.pyi$'
- repo: https://github.com/codespell-project/codespell - repo: https://github.com/codespell-project/codespell
rev: v2.4.1 rev: v2.4.1
hooks: hooks:
...@@ -42,7 +44,11 @@ repos: ...@@ -42,7 +44,11 @@ repos:
exclude: | exclude: |
(?x)^( (?x)^(
test/srt/test_reasoning_parser\.py| test/srt/test_reasoning_parser\.py|
docs/advanced_features/vlm_query\.ipynb docs/advanced_features/vlm_query\.ipynb|
python/sglang/srt/grpc/.*_pb2\.py|
python/sglang/srt/grpc/.*_pb2_grpc\.py|
python/sglang/srt/grpc/.*_pb2\.pyi|
python/sglang/srt/grpc/.*_pb2_grpc\.pyi
)$ )$
- repo: https://github.com/pre-commit/mirrors-clang-format - repo: https://github.com/pre-commit/mirrors-clang-format
rev: v18.1.8 rev: v18.1.8
......
This diff is collapsed.
This diff is collapsed.
syntax = "proto3";
package sglang.grpc.scheduler;
import "google/protobuf/timestamp.proto";
import "google/protobuf/struct.proto";
// Service definition for SGLang scheduler communication.
// This protocol bridges the Rust router and Python scheduler.
service SglangScheduler {
  // Submit a generation request. The server streams GenerateResponse
  // messages: incremental chunks, then a final complete or error payload.
  rpc Generate(GenerateRequest) returns (stream GenerateResponse);

  // Submit an embedding request (unary).
  rpc Embed(EmbedRequest) returns (EmbedResponse);

  // Health check and metrics. Carries a tokenized probe input
  // (see HealthCheckRequest).
  rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse);

  // Abort a running request identified by AbortRequest.request_id.
  rpc Abort(AbortRequest) returns (AbortResponse);
}
// =====================
// Common Types
// =====================
// Sampling parameters matching SGLang's SamplingParams.
// NOTE(review): proto3 scalars have implicit presence — 0/""/false is
// indistinguishable from "unset" (e.g. temperature = 0.0 vs. not provided).
// Consider `optional` for new fields where "not set" must be detectable.
message SamplingParams {
  float temperature = 1;
  float top_p = 2;
  int32 top_k = 3;
  float min_p = 4;
  float frequency_penalty = 5;
  float presence_penalty = 6;
  float repetition_penalty = 7;
  int32 max_new_tokens = 8;
  // Stop strings; generation halts when one is produced.
  repeated string stop = 9;
  // Token IDs that terminate generation when emitted.
  repeated int32 stop_token_ids = 10;
  bool skip_special_tokens = 11;
  bool spaces_between_special_tokens = 12;

  // Structured generation constraint (at most one may be set).
  oneof constraint {
    string regex = 13;
    string json_schema = 14;
    string ebnf_grammar = 15;
  }

  // LoRA adapter
  string lora_path = 16;

  // Number of samples to return.
  // NOTE(review): upstream comment said "Speculative decoding" but the field
  // comment calls n the sample count — confirm intended semantics.
  int32 n = 17; // Number of samples

  // Token healing
  bool token_healing = 18;

  // Additional parameters
  int32 min_new_tokens = 19;
  bool ignore_eos = 20;
  bool no_stop_trim = 21;
  int32 stream_interval = 22;
  // Per-token bias added to logits. Map keys cannot be numeric floats in
  // protobuf; key here is a string — presumably the token id rendered as
  // text. TODO(review): confirm key format against the scheduler.
  map<string, float> logit_bias = 23;
  string structural_tag = 24;

  // Custom parameters for extensibility (untyped JSON-like payload).
  google.protobuf.Struct custom_params = 25;
}
// Disaggregated serving parameters (prefill/decode split).
message DisaggregatedParams {
  // Bootstrap endpoint host used for the disaggregated handoff.
  string bootstrap_host = 1;
  int32 bootstrap_port = 2;
  // NOTE(review): room id as int32 — confirm ids cannot exceed 2^31-1;
  // widening later to int64 is varint-compatible but callers must re-check.
  int32 bootstrap_room = 3;
}
// =====================
// Generate Request
// =====================
message GenerateRequest {
  // Unique id for this request; also used by Abort and response correlation.
  string request_id = 1;
  // Input must be tokenized (no raw text).
  TokenizedInput tokenized = 2;
  // Multimodal inputs
  MultimodalInputs mm_inputs = 3;
  // Generation parameters
  SamplingParams sampling_params = 4;
  // Return options
  bool return_logprob = 5;
  int32 logprob_start_len = 6;
  int32 top_logprobs_num = 7;
  repeated int32 token_ids_logprob = 8;
  bool return_hidden_states = 9;
  // For disaggregated serving
  DisaggregatedParams disaggregated_params = 10;
  // Custom logit processor (serialized)
  string custom_logit_processor = 11;
  // Request metadata
  google.protobuf.Timestamp timestamp = 12;
  bool log_metrics = 13;
  // Input embeddings (alternative to text/tokens). Flattened float list;
  // NOTE(review): the embedding shape must be agreed out-of-band — confirm.
  repeated float input_embeds = 14;
  // LoRA adapter ID (if pre-loaded)
  string lora_id = 15;
  // Data parallel routing
  int32 data_parallel_rank = 16;
  // For load balancing
  int32 dp_balance_id = 17;
}
// Pre-tokenized input: the token ids are authoritative; the original text
// is carried only for reference/debugging.
message TokenizedInput {
  string original_text = 1; // For reference
  repeated int32 input_ids = 2;
}
message MultimodalInputs {
  // Simplified multimodal handling - actual data processed by tokenizer.
  // URLs and raw bytes are alternative transports for the same media;
  // NOTE(review): whether urls and *_data may be mixed is not specified
  // here — confirm with the consumer.
  repeated string image_urls = 1;
  repeated string video_urls = 2;
  repeated string audio_urls = 3;
  // Pre-processed multimodal features (if available)
  google.protobuf.Struct processed_features = 4;
  // Raw data for direct processing
  repeated bytes image_data = 5;
  repeated bytes video_data = 6;
  repeated bytes audio_data = 7;
  // Modality metadata
  repeated string modalities = 8;
}
// =====================
// Generate Response
// =====================
// One element of the Generate response stream. Each message carries exactly
// one of: an incremental chunk, the terminal completion, or an error.
message GenerateResponse {
  string request_id = 1;

  // Response type
  oneof response {
    GenerateStreamChunk chunk = 2;
    GenerateComplete complete = 3;
    GenerateError error = 4;
  }
}
message GenerateStreamChunk {
  // Generated token
  int32 token_id = 1;
  string text = 2;
  // Cumulative counts
  int32 prompt_tokens = 3;
  int32 completion_tokens = 4;
  int32 cached_tokens = 5;
  // Logprobs (if requested)
  LogProbs logprobs = 6;
  // Hidden states (if requested)
  repeated float hidden_states = 7;
  // Metadata.
  // NOTE(review): units are unspecified for both timing fields — presumably
  // seconds for generation_time and ms for queue_time; confirm and consider
  // unit-suffixed names (e.g. *_ms) for any new fields.
  float generation_time = 8; // Time to generate this token
  int32 queue_time = 9; // Time spent in queue
}
message GenerateComplete {
  // Final output
  repeated int32 output_ids = 1;
  string output_text = 2;

  // Finish reason.
  // NOTE(review): best practice is a <TYPE>_UNSPECIFIED = 0 value and
  // type-prefixed names (FINISH_REASON_STOP, ...); here 0 carries business
  // meaning (STOP) and values are unprefixed. This is wire-frozen now —
  // changing it would break the checked-in generated code and clients.
  enum FinishReason {
    // The model generated a stop sequence.
    STOP = 0;
    // The model reached the maximum generation length.
    LENGTH = 1;
    // The model generated an end-of-sequence (EOS) token.
    EOS_TOKEN = 2;
    // The model generated a user-provided stop string.
    STOP_STR = 3;
    // The request was aborted by the user or system.
    ABORT = 4;
  }
  FinishReason finish_reason = 3;

  // All logprobs if requested.
  // NOTE(review): field numbers 4-10 are unused (jump to 11). If they were
  // ever released in an earlier revision, add `reserved 4 to 10;`.
  repeated LogProbs all_logprobs = 11;
  // All hidden states if requested
  repeated HiddenStates all_hidden_states = 12;
}
message GenerateError {
  string message = 1;
  // NOTE(review): HTTP status carried as a string rather than int32 —
  // confirm whether non-numeric values are expected; otherwise int32 would
  // be more natural (but changing the type now is wire-breaking).
  string http_status_code = 2;
  string details = 3;
}
message LogProbs {
  // NOTE(review): these appear to be parallel arrays indexed by output
  // position (token_logprobs[i] belongs to token_ids[i] / token_texts[i]);
  // confirm — a repeated message of per-position entries would be the
  // conventional shape for new APIs.
  repeated float token_logprobs = 1;
  repeated int32 token_ids = 2;
  // Top logprobs at each position
  repeated TopLogProbs top_logprobs = 3;
  // Decoded text for tokens
  repeated string token_texts = 4;
}

// Top-k alternatives at a single output position (parallel arrays).
message TopLogProbs {
  repeated float values = 1;
  repeated int32 token_ids = 2;
  repeated string token_texts = 3;
}

// Hidden-state vector for one layer/position.
message HiddenStates {
  repeated float values = 1;
  int32 layer = 2;
  int32 position = 3;
}
// =====================
// Embedding Request
// =====================
message EmbedRequest {
  string request_id = 1;
  // Input must be tokenized (no raw text).
  // NOTE(review): field number 3 is skipped (tokenized=2, mm_inputs=4).
  // If 3 was used in an earlier release, declare `reserved 3;` to prevent
  // accidental reuse.
  TokenizedInput tokenized = 2;
  // Multimodal inputs
  MultimodalInputs mm_inputs = 4;
  // Dummy sampling params for compatibility.
  // EmbedRequest doesn't use sampling_params.
  SamplingParams sampling_params = 5;
  bool log_metrics = 6;
  // Token type IDs for models that require them
  repeated int32 token_type_ids = 7;
  // Data parallel routing
  int32 data_parallel_rank = 8;
  // For cross-encoder requests
  bool is_cross_encoder = 9;
  repeated string texts = 10; // For cross-encoder batch
}
message EmbedResponse {
  string request_id = 1;

  oneof response {
    EmbedComplete complete = 2;
    EmbedError error = 3;
  }
}

message EmbedComplete {
  // Single-input result: flattened embedding vector.
  repeated float embedding = 1;
  int32 prompt_tokens = 2;
  int32 cached_tokens = 3;
  // Additional metadata
  int32 embedding_dim = 4;
  // NOTE(review): units unspecified — presumably seconds; confirm.
  float generation_time = 5;
  // For batch embeddings (used instead of `embedding` for batched input —
  // TODO(review): confirm the two fields are mutually exclusive).
  repeated Embedding batch_embeddings = 6;
}

// One embedding of a batch, tagged with its input index.
message Embedding {
  repeated float values = 1;
  int32 index = 2;
}

message EmbedError {
  string message = 1;
  string code = 2;
  string details = 3;
}
// =====================
// Management Operations
// =====================
message HealthCheckRequest {
  // Input for health test generation (must be tokenized). The scheduler
  // runs this probe input to verify end-to-end liveness.
  TokenizedInput tokenized = 1;
}

message HealthCheckResponse {
  bool healthy = 1;
  string message = 2;
}

message AbortRequest {
  // Id of the in-flight GenerateRequest to cancel.
  string request_id = 1;
  string reason = 2;
}

message AbortResponse {
  bool success = 1;
  string message = 2;
}
// =====================
// Additional Operations (Future)
// =====================
// NOTE(review): none of the messages below are referenced by any RPC in
// SglangScheduler yet ("Future" operations). They are safe to evolve until
// a service method ships with them.

// Load LoRA adapter
message LoadLoRARequest {
  string adapter_id = 1;
  string adapter_path = 2;
  int32 rank = 3;
}

message LoadLoRAResponse {
  bool success = 1;
  string adapter_id = 2;
  string message = 3;
}

// Unload LoRA adapter
message UnloadLoRARequest {
  string adapter_id = 1;
}

message UnloadLoRAResponse {
  bool success = 1;
  string message = 2;
}

// Update weights
message UpdateWeightsRequest {
  // Exactly one weight source may be set.
  oneof source {
    string disk_path = 1;
    bytes tensor_data = 2;
    string remote_url = 3;
  }
  // NOTE(review): tensor_data embeds the full tensor in one message;
  // protobuf caps messages (2 GB hard, 64 MB default in many runtimes) —
  // consider a streaming/chunked RPC before shipping this.
  string weight_name = 4;
}

message UpdateWeightsResponse {
  bool success = 1;
  string message = 2;
}

// Get internal state for debugging
message GetInternalStateRequest {
  repeated string state_keys = 1;
}

message GetInternalStateResponse {
  google.protobuf.Struct state = 1;
}

// Set internal state for testing
message SetInternalStateRequest {
  google.protobuf.Struct state = 1;
}

message SetInternalStateResponse {
  bool success = 1;
  string message = 2;
}
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# NO CHECKED-IN PROTOBUF GENCODE
# source: sglang_scheduler.proto
# Protobuf Python Version: 6.31.1
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import runtime_version as _runtime_version
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
# Fail fast at import time if the installed protobuf runtime is older than
# the gencode version (6.31.1) that produced this module.
_runtime_version.ValidateProtobufRuntimeVersion(
    _runtime_version.Domain.PUBLIC,
    6,
    31,
    1,
    '',
    'sglang_scheduler.proto'
)
# @@protoc_insertion_point(imports)

_sym_db = _symbol_database.Default()

# Dependencies of sglang_scheduler.proto (Timestamp, Struct well-known types).
from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2
from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x16sglang_scheduler.proto\x12\x15sglang.grpc.scheduler\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1cgoogle/protobuf/struct.proto\"\xc7\x05\n\x0eSamplingParams\x12\x13\n\x0btemperature\x18\x01 \x01(\x02\x12\r\n\x05top_p\x18\x02 \x01(\x02\x12\r\n\x05top_k\x18\x03 \x01(\x05\x12\r\n\x05min_p\x18\x04 \x01(\x02\x12\x19\n\x11\x66requency_penalty\x18\x05 \x01(\x02\x12\x18\n\x10presence_penalty\x18\x06 \x01(\x02\x12\x1a\n\x12repetition_penalty\x18\x07 \x01(\x02\x12\x16\n\x0emax_new_tokens\x18\x08 \x01(\x05\x12\x0c\n\x04stop\x18\t \x03(\t\x12\x16\n\x0estop_token_ids\x18\n \x03(\x05\x12\x1b\n\x13skip_special_tokens\x18\x0b \x01(\x08\x12%\n\x1dspaces_between_special_tokens\x18\x0c \x01(\x08\x12\x0f\n\x05regex\x18\r \x01(\tH\x00\x12\x15\n\x0bjson_schema\x18\x0e \x01(\tH\x00\x12\x16\n\x0c\x65\x62nf_grammar\x18\x0f \x01(\tH\x00\x12\x11\n\tlora_path\x18\x10 \x01(\t\x12\t\n\x01n\x18\x11 \x01(\x05\x12\x15\n\rtoken_healing\x18\x12 \x01(\x08\x12\x16\n\x0emin_new_tokens\x18\x13 \x01(\x05\x12\x12\n\nignore_eos\x18\x14 \x01(\x08\x12\x14\n\x0cno_stop_trim\x18\x15 \x01(\x08\x12\x17\n\x0fstream_interval\x18\x16 \x01(\x05\x12H\n\nlogit_bias\x18\x17 \x03(\x0b\x32\x34.sglang.grpc.scheduler.SamplingParams.LogitBiasEntry\x12\x16\n\x0estructural_tag\x18\x18 \x01(\t\x12.\n\rcustom_params\x18\x19 \x01(\x0b\x32\x17.google.protobuf.Struct\x1a\x30\n\x0eLogitBiasEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\x02:\x02\x38\x01\x42\x0c\n\nconstraint\"]\n\x13\x44isaggregatedParams\x12\x16\n\x0e\x62ootstrap_host\x18\x01 \x01(\t\x12\x16\n\x0e\x62ootstrap_port\x18\x02 \x01(\x05\x12\x16\n\x0e\x62ootstrap_room\x18\x03 \x01(\x05\"\xe9\x04\n\x0fGenerateRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x38\n\ttokenized\x18\x02 \x01(\x0b\x32%.sglang.grpc.scheduler.TokenizedInput\x12:\n\tmm_inputs\x18\x03 \x01(\x0b\x32\'.sglang.grpc.scheduler.MultimodalInputs\x12>\n\x0fsampling_params\x18\x04 
\x01(\x0b\x32%.sglang.grpc.scheduler.SamplingParams\x12\x16\n\x0ereturn_logprob\x18\x05 \x01(\x08\x12\x19\n\x11logprob_start_len\x18\x06 \x01(\x05\x12\x18\n\x10top_logprobs_num\x18\x07 \x01(\x05\x12\x19\n\x11token_ids_logprob\x18\x08 \x03(\x05\x12\x1c\n\x14return_hidden_states\x18\t \x01(\x08\x12H\n\x14\x64isaggregated_params\x18\n \x01(\x0b\x32*.sglang.grpc.scheduler.DisaggregatedParams\x12\x1e\n\x16\x63ustom_logit_processor\x18\x0b \x01(\t\x12-\n\ttimestamp\x18\x0c \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x13\n\x0blog_metrics\x18\r \x01(\x08\x12\x14\n\x0cinput_embeds\x18\x0e \x03(\x02\x12\x0f\n\x07lora_id\x18\x0f \x01(\t\x12\x1a\n\x12\x64\x61ta_parallel_rank\x18\x10 \x01(\x05\x12\x15\n\rdp_balance_id\x18\x11 \x01(\x05\":\n\x0eTokenizedInput\x12\x15\n\roriginal_text\x18\x01 \x01(\t\x12\x11\n\tinput_ids\x18\x02 \x03(\x05\"\xd3\x01\n\x10MultimodalInputs\x12\x12\n\nimage_urls\x18\x01 \x03(\t\x12\x12\n\nvideo_urls\x18\x02 \x03(\t\x12\x12\n\naudio_urls\x18\x03 \x03(\t\x12\x33\n\x12processed_features\x18\x04 \x01(\x0b\x32\x17.google.protobuf.Struct\x12\x12\n\nimage_data\x18\x05 \x03(\x0c\x12\x12\n\nvideo_data\x18\x06 \x03(\x0c\x12\x12\n\naudio_data\x18\x07 \x03(\x0c\x12\x12\n\nmodalities\x18\x08 \x03(\t\"\xe3\x01\n\x10GenerateResponse\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12;\n\x05\x63hunk\x18\x02 \x01(\x0b\x32*.sglang.grpc.scheduler.GenerateStreamChunkH\x00\x12;\n\x08\x63omplete\x18\x03 \x01(\x0b\x32\'.sglang.grpc.scheduler.GenerateCompleteH\x00\x12\x35\n\x05\x65rror\x18\x04 \x01(\x0b\x32$.sglang.grpc.scheduler.GenerateErrorH\x00\x42\n\n\x08response\"\xf5\x01\n\x13GenerateStreamChunk\x12\x10\n\x08token_id\x18\x01 \x01(\x05\x12\x0c\n\x04text\x18\x02 \x01(\t\x12\x15\n\rprompt_tokens\x18\x03 \x01(\x05\x12\x19\n\x11\x63ompletion_tokens\x18\x04 \x01(\x05\x12\x15\n\rcached_tokens\x18\x05 \x01(\x05\x12\x31\n\x08logprobs\x18\x06 \x01(\x0b\x32\x1f.sglang.grpc.scheduler.LogProbs\x12\x15\n\rhidden_states\x18\x07 \x03(\x02\x12\x17\n\x0fgeneration_time\x18\x08 
\x01(\x02\x12\x12\n\nqueue_time\x18\t \x01(\x05\"\xcd\x02\n\x10GenerateComplete\x12\x12\n\noutput_ids\x18\x01 \x03(\x05\x12\x13\n\x0boutput_text\x18\x02 \x01(\t\x12K\n\rfinish_reason\x18\x03 \x01(\x0e\x32\x34.sglang.grpc.scheduler.GenerateComplete.FinishReason\x12\x35\n\x0c\x61ll_logprobs\x18\x0b \x03(\x0b\x32\x1f.sglang.grpc.scheduler.LogProbs\x12>\n\x11\x61ll_hidden_states\x18\x0c \x03(\x0b\x32#.sglang.grpc.scheduler.HiddenStates\"L\n\x0c\x46inishReason\x12\x08\n\x04STOP\x10\x00\x12\n\n\x06LENGTH\x10\x01\x12\r\n\tEOS_TOKEN\x10\x02\x12\x0c\n\x08STOP_STR\x10\x03\x12\t\n\x05\x41\x42ORT\x10\x04\"K\n\rGenerateError\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x18\n\x10http_status_code\x18\x02 \x01(\t\x12\x0f\n\x07\x64\x65tails\x18\x03 \x01(\t\"\x84\x01\n\x08LogProbs\x12\x16\n\x0etoken_logprobs\x18\x01 \x03(\x02\x12\x11\n\ttoken_ids\x18\x02 \x03(\x05\x12\x38\n\x0ctop_logprobs\x18\x03 \x03(\x0b\x32\".sglang.grpc.scheduler.TopLogProbs\x12\x13\n\x0btoken_texts\x18\x04 \x03(\t\"E\n\x0bTopLogProbs\x12\x0e\n\x06values\x18\x01 \x03(\x02\x12\x11\n\ttoken_ids\x18\x02 \x03(\x05\x12\x13\n\x0btoken_texts\x18\x03 \x03(\t\"?\n\x0cHiddenStates\x12\x0e\n\x06values\x18\x01 \x03(\x02\x12\r\n\x05layer\x18\x02 \x01(\x05\x12\x10\n\x08position\x18\x03 \x01(\x05\"\xca\x02\n\x0c\x45mbedRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x38\n\ttokenized\x18\x02 \x01(\x0b\x32%.sglang.grpc.scheduler.TokenizedInput\x12:\n\tmm_inputs\x18\x04 \x01(\x0b\x32\'.sglang.grpc.scheduler.MultimodalInputs\x12>\n\x0fsampling_params\x18\x05 \x01(\x0b\x32%.sglang.grpc.scheduler.SamplingParams\x12\x13\n\x0blog_metrics\x18\x06 \x01(\x08\x12\x16\n\x0etoken_type_ids\x18\x07 \x03(\x05\x12\x1a\n\x12\x64\x61ta_parallel_rank\x18\x08 \x01(\x05\x12\x18\n\x10is_cross_encoder\x18\t \x01(\x08\x12\r\n\x05texts\x18\n \x03(\t\"\x9d\x01\n\rEmbedResponse\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x38\n\x08\x63omplete\x18\x02 \x01(\x0b\x32$.sglang.grpc.scheduler.EmbedCompleteH\x00\x12\x32\n\x05\x65rror\x18\x03 
\x01(\x0b\x32!.sglang.grpc.scheduler.EmbedErrorH\x00\x42\n\n\x08response\"\xbc\x01\n\rEmbedComplete\x12\x11\n\tembedding\x18\x01 \x03(\x02\x12\x15\n\rprompt_tokens\x18\x02 \x01(\x05\x12\x15\n\rcached_tokens\x18\x03 \x01(\x05\x12\x15\n\rembedding_dim\x18\x04 \x01(\x05\x12\x17\n\x0fgeneration_time\x18\x05 \x01(\x02\x12:\n\x10\x62\x61tch_embeddings\x18\x06 \x03(\x0b\x32 .sglang.grpc.scheduler.Embedding\"*\n\tEmbedding\x12\x0e\n\x06values\x18\x01 \x03(\x02\x12\r\n\x05index\x18\x02 \x01(\x05\"<\n\nEmbedError\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x0c\n\x04\x63ode\x18\x02 \x01(\t\x12\x0f\n\x07\x64\x65tails\x18\x03 \x01(\t\"N\n\x12HealthCheckRequest\x12\x38\n\ttokenized\x18\x01 \x01(\x0b\x32%.sglang.grpc.scheduler.TokenizedInput\"7\n\x13HealthCheckResponse\x12\x0f\n\x07healthy\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t\"2\n\x0c\x41\x62ortRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x0e\n\x06reason\x18\x02 \x01(\t\"1\n\rAbortResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t\"I\n\x0fLoadLoRARequest\x12\x12\n\nadapter_id\x18\x01 \x01(\t\x12\x14\n\x0c\x61\x64\x61pter_path\x18\x02 \x01(\t\x12\x0c\n\x04rank\x18\x03 \x01(\x05\"H\n\x10LoadLoRAResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x12\n\nadapter_id\x18\x02 \x01(\t\x12\x0f\n\x07message\x18\x03 \x01(\t\"\'\n\x11UnloadLoRARequest\x12\x12\n\nadapter_id\x18\x01 \x01(\t\"6\n\x12UnloadLoRAResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t\"w\n\x14UpdateWeightsRequest\x12\x13\n\tdisk_path\x18\x01 \x01(\tH\x00\x12\x15\n\x0btensor_data\x18\x02 \x01(\x0cH\x00\x12\x14\n\nremote_url\x18\x03 \x01(\tH\x00\x12\x13\n\x0bweight_name\x18\x04 \x01(\tB\x08\n\x06source\"9\n\x15UpdateWeightsResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t\"-\n\x17GetInternalStateRequest\x12\x12\n\nstate_keys\x18\x01 \x03(\t\"B\n\x18GetInternalStateResponse\x12&\n\x05state\x18\x01 
\x01(\x0b\x32\x17.google.protobuf.Struct\"A\n\x17SetInternalStateRequest\x12&\n\x05state\x18\x01 \x01(\x0b\x32\x17.google.protobuf.Struct\"<\n\x18SetInternalStateResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t2\xfe\x02\n\x0fSglangScheduler\x12]\n\x08Generate\x12&.sglang.grpc.scheduler.GenerateRequest\x1a\'.sglang.grpc.scheduler.GenerateResponse0\x01\x12R\n\x05\x45mbed\x12#.sglang.grpc.scheduler.EmbedRequest\x1a$.sglang.grpc.scheduler.EmbedResponse\x12\x64\n\x0bHealthCheck\x12).sglang.grpc.scheduler.HealthCheckRequest\x1a*.sglang.grpc.scheduler.HealthCheckResponse\x12R\n\x05\x41\x62ort\x12#.sglang.grpc.scheduler.AbortRequest\x1a$.sglang.grpc.scheduler.AbortResponseb\x06proto3')
# Materialize message/enum classes from the serialized descriptor into this
# module's namespace.
_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'sglang_scheduler_pb2', _globals)
if not _descriptor._USE_C_DESCRIPTORS:
  # Pure-Python descriptors: attach serialized options and record each
  # entity's byte offsets within the serialized file descriptor above.
  DESCRIPTOR._loaded_options = None
  _globals['_SAMPLINGPARAMS_LOGITBIASENTRY']._loaded_options = None
  _globals['_SAMPLINGPARAMS_LOGITBIASENTRY']._serialized_options = b'8\001'
  _globals['_SAMPLINGPARAMS']._serialized_start=113
  _globals['_SAMPLINGPARAMS']._serialized_end=824
  _globals['_SAMPLINGPARAMS_LOGITBIASENTRY']._serialized_start=762
  _globals['_SAMPLINGPARAMS_LOGITBIASENTRY']._serialized_end=810
  _globals['_DISAGGREGATEDPARAMS']._serialized_start=826
  _globals['_DISAGGREGATEDPARAMS']._serialized_end=919
  _globals['_GENERATEREQUEST']._serialized_start=922
  _globals['_GENERATEREQUEST']._serialized_end=1539
  _globals['_TOKENIZEDINPUT']._serialized_start=1541
  _globals['_TOKENIZEDINPUT']._serialized_end=1599
  _globals['_MULTIMODALINPUTS']._serialized_start=1602
  _globals['_MULTIMODALINPUTS']._serialized_end=1813
  _globals['_GENERATERESPONSE']._serialized_start=1816
  _globals['_GENERATERESPONSE']._serialized_end=2043
  _globals['_GENERATESTREAMCHUNK']._serialized_start=2046
  _globals['_GENERATESTREAMCHUNK']._serialized_end=2291
  _globals['_GENERATECOMPLETE']._serialized_start=2294
  _globals['_GENERATECOMPLETE']._serialized_end=2627
  _globals['_GENERATECOMPLETE_FINISHREASON']._serialized_start=2551
  _globals['_GENERATECOMPLETE_FINISHREASON']._serialized_end=2627
  _globals['_GENERATEERROR']._serialized_start=2629
  _globals['_GENERATEERROR']._serialized_end=2704
  _globals['_LOGPROBS']._serialized_start=2707
  _globals['_LOGPROBS']._serialized_end=2839
  _globals['_TOPLOGPROBS']._serialized_start=2841
  _globals['_TOPLOGPROBS']._serialized_end=2910
  _globals['_HIDDENSTATES']._serialized_start=2912
  _globals['_HIDDENSTATES']._serialized_end=2975
  _globals['_EMBEDREQUEST']._serialized_start=2978
  _globals['_EMBEDREQUEST']._serialized_end=3308
  _globals['_EMBEDRESPONSE']._serialized_start=3311
  _globals['_EMBEDRESPONSE']._serialized_end=3468
  _globals['_EMBEDCOMPLETE']._serialized_start=3471
  _globals['_EMBEDCOMPLETE']._serialized_end=3659
  _globals['_EMBEDDING']._serialized_start=3661
  _globals['_EMBEDDING']._serialized_end=3703
  _globals['_EMBEDERROR']._serialized_start=3705
  _globals['_EMBEDERROR']._serialized_end=3765
  _globals['_HEALTHCHECKREQUEST']._serialized_start=3767
  _globals['_HEALTHCHECKREQUEST']._serialized_end=3845
  _globals['_HEALTHCHECKRESPONSE']._serialized_start=3847
  _globals['_HEALTHCHECKRESPONSE']._serialized_end=3902
  _globals['_ABORTREQUEST']._serialized_start=3904
  _globals['_ABORTREQUEST']._serialized_end=3954
  _globals['_ABORTRESPONSE']._serialized_start=3956
  _globals['_ABORTRESPONSE']._serialized_end=4005
  _globals['_LOADLORAREQUEST']._serialized_start=4007
  _globals['_LOADLORAREQUEST']._serialized_end=4080
  _globals['_LOADLORARESPONSE']._serialized_start=4082
  _globals['_LOADLORARESPONSE']._serialized_end=4154
  _globals['_UNLOADLORAREQUEST']._serialized_start=4156
  _globals['_UNLOADLORAREQUEST']._serialized_end=4195
  _globals['_UNLOADLORARESPONSE']._serialized_start=4197
  _globals['_UNLOADLORARESPONSE']._serialized_end=4251
  _globals['_UPDATEWEIGHTSREQUEST']._serialized_start=4253
  _globals['_UPDATEWEIGHTSREQUEST']._serialized_end=4372
  _globals['_UPDATEWEIGHTSRESPONSE']._serialized_start=4374
  _globals['_UPDATEWEIGHTSRESPONSE']._serialized_end=4431
  _globals['_GETINTERNALSTATEREQUEST']._serialized_start=4433
  _globals['_GETINTERNALSTATEREQUEST']._serialized_end=4478
  _globals['_GETINTERNALSTATERESPONSE']._serialized_start=4480
  _globals['_GETINTERNALSTATERESPONSE']._serialized_end=4546
  _globals['_SETINTERNALSTATEREQUEST']._serialized_start=4548
  _globals['_SETINTERNALSTATEREQUEST']._serialized_end=4613
  _globals['_SETINTERNALSTATERESPONSE']._serialized_start=4615
  _globals['_SETINTERNALSTATERESPONSE']._serialized_end=4675
  _globals['_SGLANGSCHEDULER']._serialized_start=4678
  _globals['_SGLANGSCHEDULER']._serialized_end=5060
# @@protoc_insertion_point(module_scope)
This diff is collapsed.
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc
import warnings

from . import sglang_scheduler_pb2 as sglang__scheduler__pb2

# grpcio-tools version that produced this module; the installed grpcio
# runtime must be at least this new for registered-method support.
GRPC_GENERATED_VERSION = '1.74.0'
GRPC_VERSION = grpc.__version__
_version_not_supported = False

try:
    from grpc._utilities import first_version_is_lower
    _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
except ImportError:
    # grpcio too old to even carry the comparison helper.
    _version_not_supported = True

if _version_not_supported:
    raise RuntimeError(
        f'The grpc package installed is at version {GRPC_VERSION},'
        + f' but the generated code in sglang_scheduler_pb2_grpc.py depends on'
        + f' grpcio>={GRPC_GENERATED_VERSION}.'
        + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
        + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
    )
class SglangSchedulerStub(object):
    """Service definition for SGLang scheduler communication
    This protocol bridges the Rust router and Python scheduler
    """

    def __init__(self, channel):
        """Constructor.

        Args:
            channel: A grpc.Channel.
        """
        # Generate is server-streaming; Embed/HealthCheck/Abort are unary.
        self.Generate = channel.unary_stream(
                '/sglang.grpc.scheduler.SglangScheduler/Generate',
                request_serializer=sglang__scheduler__pb2.GenerateRequest.SerializeToString,
                response_deserializer=sglang__scheduler__pb2.GenerateResponse.FromString,
                _registered_method=True)
        self.Embed = channel.unary_unary(
                '/sglang.grpc.scheduler.SglangScheduler/Embed',
                request_serializer=sglang__scheduler__pb2.EmbedRequest.SerializeToString,
                response_deserializer=sglang__scheduler__pb2.EmbedResponse.FromString,
                _registered_method=True)
        self.HealthCheck = channel.unary_unary(
                '/sglang.grpc.scheduler.SglangScheduler/HealthCheck',
                request_serializer=sglang__scheduler__pb2.HealthCheckRequest.SerializeToString,
                response_deserializer=sglang__scheduler__pb2.HealthCheckResponse.FromString,
                _registered_method=True)
        self.Abort = channel.unary_unary(
                '/sglang.grpc.scheduler.SglangScheduler/Abort',
                request_serializer=sglang__scheduler__pb2.AbortRequest.SerializeToString,
                response_deserializer=sglang__scheduler__pb2.AbortResponse.FromString,
                _registered_method=True)
class SglangSchedulerServicer(object):
    """Service definition for SGLang scheduler communication
    This protocol bridges the Rust router and Python scheduler

    Base class: subclass, override the methods, and register the instance
    via add_SglangSchedulerServicer_to_server. Each default implementation
    reports UNIMPLEMENTED to the caller.
    """

    def Generate(self, request, context):
        """Submit a generation request (supports streaming)
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Embed(self, request, context):
        """Submit an embedding request
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def HealthCheck(self, request, context):
        """Health check and metrics
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Abort(self, request, context):
        """Abort a running request
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
def add_SglangSchedulerServicer_to_server(servicer, server):
    """Register a SglangSchedulerServicer's handlers on a grpc.Server."""
    rpc_method_handlers = {
            'Generate': grpc.unary_stream_rpc_method_handler(
                    servicer.Generate,
                    request_deserializer=sglang__scheduler__pb2.GenerateRequest.FromString,
                    response_serializer=sglang__scheduler__pb2.GenerateResponse.SerializeToString,
            ),
            'Embed': grpc.unary_unary_rpc_method_handler(
                    servicer.Embed,
                    request_deserializer=sglang__scheduler__pb2.EmbedRequest.FromString,
                    response_serializer=sglang__scheduler__pb2.EmbedResponse.SerializeToString,
            ),
            'HealthCheck': grpc.unary_unary_rpc_method_handler(
                    servicer.HealthCheck,
                    request_deserializer=sglang__scheduler__pb2.HealthCheckRequest.FromString,
                    response_serializer=sglang__scheduler__pb2.HealthCheckResponse.SerializeToString,
            ),
            'Abort': grpc.unary_unary_rpc_method_handler(
                    servicer.Abort,
                    request_deserializer=sglang__scheduler__pb2.AbortRequest.FromString,
                    response_serializer=sglang__scheduler__pb2.AbortResponse.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'sglang.grpc.scheduler.SglangScheduler', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))
    # Also register by fully-qualified method name for the faster
    # registered-method dispatch path (grpcio >= GRPC_GENERATED_VERSION).
    server.add_registered_method_handlers('sglang.grpc.scheduler.SglangScheduler', rpc_method_handlers)
# This class is part of an EXPERIMENTAL API.
class SglangScheduler(object):
    """Service definition for SGLang scheduler communication
    This protocol bridges the Rust router and Python scheduler

    Channel-less convenience API: each static method creates/reuses a channel
    to `target` per call via grpc.experimental.
    """

    @staticmethod
    def Generate(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_stream(
            request,
            target,
            '/sglang.grpc.scheduler.SglangScheduler/Generate',
            sglang__scheduler__pb2.GenerateRequest.SerializeToString,
            sglang__scheduler__pb2.GenerateResponse.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)

    @staticmethod
    def Embed(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(
            request,
            target,
            '/sglang.grpc.scheduler.SglangScheduler/Embed',
            sglang__scheduler__pb2.EmbedRequest.SerializeToString,
            sglang__scheduler__pb2.EmbedResponse.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)

    @staticmethod
    def HealthCheck(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(
            request,
            target,
            '/sglang.grpc.scheduler.SglangScheduler/HealthCheck',
            sglang__scheduler__pb2.HealthCheckRequest.SerializeToString,
            sglang__scheduler__pb2.HealthCheckResponse.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)

    @staticmethod
    def Abort(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(
            request,
            target,
            '/sglang.grpc.scheduler.SglangScheduler/Abort',
            sglang__scheduler__pb2.AbortRequest.SerializeToString,
            sglang__scheduler__pb2.AbortResponse.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)
...@@ -2238,6 +2238,7 @@ class ServerArgs: ...@@ -2238,6 +2238,7 @@ class ServerArgs:
args.pp_size = args.pipeline_parallel_size args.pp_size = args.pipeline_parallel_size
args.dp_size = args.data_parallel_size args.dp_size = args.data_parallel_size
args.ep_size = args.expert_parallel_size args.ep_size = args.expert_parallel_size
attrs = [attr.name for attr in dataclasses.fields(cls)] attrs = [attr.name for attr in dataclasses.fields(cls)]
return cls(**{attr: getattr(args, attr) for attr in attrs}) return cls(**{attr: getattr(args, attr) for attr in attrs})
......
...@@ -37,21 +37,6 @@ impl SglangSchedulerClient { ...@@ -37,21 +37,6 @@ impl SglangSchedulerClient {
Ok(Self { client }) Ok(Self { client })
} }
/// Initialize the connection
pub async fn initialize(
&mut self,
client_id: String,
) -> Result<proto::InitializeResponse, Box<dyn std::error::Error>> {
let request = Request::new(proto::InitializeRequest {
client_id,
client_version: "0.1.0".to_string(),
mode: proto::initialize_request::Mode::Regular as i32,
});
let response = self.client.initialize(request).await?;
Ok(response.into_inner())
}
/// Submit a generation request (returns streaming response) /// Submit a generation request (returns streaming response)
pub async fn generate_stream( pub async fn generate_stream(
&mut self, &mut self,
...@@ -68,7 +53,10 @@ impl SglangSchedulerClient { ...@@ -68,7 +53,10 @@ impl SglangSchedulerClient {
) -> Result<proto::HealthCheckResponse, Box<dyn std::error::Error>> { ) -> Result<proto::HealthCheckResponse, Box<dyn std::error::Error>> {
debug!("Sending health check request"); debug!("Sending health check request");
let request = Request::new(proto::HealthCheckRequest { let request = Request::new(proto::HealthCheckRequest {
include_detailed_metrics: false, tokenized: Some(proto::TokenizedInput {
original_text: "Hello".to_string(),
input_ids: vec![9906], // Mock token ID for "Hello"
}),
}); });
let response = self.client.health_check(request).await?; let response = self.client.health_check(request).await?;
...@@ -87,21 +75,6 @@ impl SglangSchedulerClient { ...@@ -87,21 +75,6 @@ impl SglangSchedulerClient {
self.client.abort(request).await?; self.client.abort(request).await?;
Ok(()) Ok(())
} }
/// Flush cache
pub async fn flush_cache(
&mut self,
flush_all: bool,
session_ids: &[String],
) -> Result<proto::FlushCacheResponse, Box<dyn std::error::Error>> {
let request = Request::new(proto::FlushCacheRequest {
flush_all,
session_ids: session_ids.to_vec(),
});
let response = self.client.flush_cache(request).await?;
Ok(response.into_inner())
}
} }
#[cfg(test)] #[cfg(test)]
...@@ -111,14 +84,13 @@ mod tests { ...@@ -111,14 +84,13 @@ mod tests {
#[test] #[test]
fn test_proto_types_compilation() { fn test_proto_types_compilation() {
// Test that protobuf types can be constructed // Test that protobuf types can be constructed
let init_req = proto::InitializeRequest { let health_req = proto::HealthCheckRequest {
client_id: "test-client".to_string(), tokenized: Some(proto::TokenizedInput {
client_version: "0.1.0".to_string(), original_text: "test".to_string(),
mode: 0, input_ids: vec![1296],
}),
}; };
assert_eq!(init_req.client_id, "test-client"); assert!(health_req.tokenized.is_some());
assert_eq!(init_req.client_version, "0.1.0");
assert_eq!(init_req.mode, 0);
} }
#[test] #[test]
...@@ -134,9 +106,10 @@ mod tests { ...@@ -134,9 +106,10 @@ mod tests {
let gen_req = proto::GenerateRequest { let gen_req = proto::GenerateRequest {
request_id: "test-req-123".to_string(), request_id: "test-req-123".to_string(),
input: Some(proto::generate_request::Input::Text( tokenized: Some(proto::TokenizedInput {
"Hello world".to_string(), original_text: "Hello world".to_string(),
)), input_ids: vec![9906, 1917], // Mock token IDs for "Hello world"
}),
sampling_params: Some(sampling_params), sampling_params: Some(sampling_params),
return_logprob: true, return_logprob: true,
logprob_start_len: 0, logprob_start_len: 0,
...@@ -145,8 +118,8 @@ mod tests { ...@@ -145,8 +118,8 @@ mod tests {
}; };
assert_eq!(gen_req.request_id, "test-req-123"); assert_eq!(gen_req.request_id, "test-req-123");
if let Some(proto::generate_request::Input::Text(text)) = &gen_req.input { if let Some(ref tokenized) = &gen_req.tokenized {
assert_eq!(text, "Hello world"); assert_eq!(tokenized.original_text, "Hello world");
} }
assert!(gen_req.return_logprob); assert!(gen_req.return_logprob);
assert_eq!(gen_req.top_logprobs_num, 5); assert_eq!(gen_req.top_logprobs_num, 5);
...@@ -160,9 +133,12 @@ mod tests { ...@@ -160,9 +133,12 @@ mod tests {
#[test] #[test]
fn test_health_check_request() { fn test_health_check_request() {
let health_req = proto::HealthCheckRequest { let health_req = proto::HealthCheckRequest {
include_detailed_metrics: true, tokenized: Some(proto::TokenizedInput {
original_text: "test".to_string(),
input_ids: vec![1296], // Mock token ID for "test"
}),
}; };
assert!(health_req.include_detailed_metrics); assert!(health_req.tokenized.is_some());
} }
#[test] #[test]
...@@ -175,17 +151,6 @@ mod tests { ...@@ -175,17 +151,6 @@ mod tests {
assert_eq!(abort_req.reason, "User canceled"); assert_eq!(abort_req.reason, "User canceled");
} }
#[test]
fn test_flush_cache_request() {
let flush_req = proto::FlushCacheRequest {
flush_all: true,
session_ids: vec!["session1".to_string(), "session2".to_string()],
};
assert!(flush_req.flush_all);
assert_eq!(flush_req.session_ids.len(), 2);
assert_eq!(flush_req.session_ids[0], "session1");
}
#[test] #[test]
fn test_sampling_params_defaults() { fn test_sampling_params_defaults() {
let params = proto::SamplingParams::default(); let params = proto::SamplingParams::default();
...@@ -214,38 +179,29 @@ mod tests { ...@@ -214,38 +179,29 @@ mod tests {
assert_eq!(mm_inputs.modalities[0], "image"); assert_eq!(mm_inputs.modalities[0], "image");
} }
#[test] // TODO: SessionParams not in current proto - skip test
fn test_session_params() { // #[test]
let session_params = proto::SessionParams { // fn test_session_params() { ... }
session_id: "sess-789".to_string(),
request_id: "req-101".to_string(),
offset: 100,
replace: true,
drop_previous_output: false,
};
assert_eq!(session_params.session_id, "sess-789");
assert_eq!(session_params.request_id, "req-101");
assert_eq!(session_params.offset, 100);
assert!(session_params.replace);
assert!(!session_params.drop_previous_output);
}
#[test] #[test]
fn test_embed_request() { fn test_embed_request() {
let embed_req = proto::EmbedRequest { let embed_req = proto::EmbedRequest {
request_id: "embed-req-202".to_string(), request_id: "embed-req-202".to_string(),
input: Some(proto::embed_request::Input::Text( tokenized: Some(proto::TokenizedInput {
"This is a test sentence for embedding".to_string(), original_text: "This is a test sentence for embedding".to_string(),
)), input_ids: vec![2028, 374, 264, 1296, 11914, 369, 28537], // Mock token IDs
}),
log_metrics: true, log_metrics: true,
data_parallel_rank: 0, data_parallel_rank: 0,
..Default::default() ..Default::default()
}; };
assert_eq!(embed_req.request_id, "embed-req-202"); assert_eq!(embed_req.request_id, "embed-req-202");
if let Some(proto::embed_request::Input::Text(text)) = &embed_req.input { if let Some(ref tokenized) = &embed_req.tokenized {
assert_eq!(text, "This is a test sentence for embedding"); assert_eq!(
tokenized.original_text,
"This is a test sentence for embedding"
);
} }
assert!(embed_req.log_metrics); assert!(embed_req.log_metrics);
assert_eq!(embed_req.data_parallel_rank, 0); assert_eq!(embed_req.data_parallel_rank, 0);
...@@ -292,36 +248,7 @@ mod tests { ...@@ -292,36 +248,7 @@ mod tests {
assert_eq!(chunk.queue_time, 10); assert_eq!(chunk.queue_time, 10);
} }
#[test] // TODO: ModelInfo not in current proto - skip test
fn test_model_info() { // #[test]
let model_info = proto::ModelInfo { // fn test_model_info() { ... }
model_name: "Meta-Llama-3-8B-Instruct".to_string(),
max_context_length: 8192,
vocab_size: 128256,
supports_tool_calling: true,
supports_vision: false,
special_tokens: vec![
"<|begin_of_text|>".to_string(),
"<|end_of_text|>".to_string(),
],
model_type: "llama".to_string(),
num_layers: 32,
hidden_size: 4096,
num_attention_heads: 32,
num_key_value_heads: 8,
tokenizer_type: "llama".to_string(),
eos_token_ids: vec![128001, 128009],
pad_token_id: 128001,
bos_token_id: 128000,
};
assert_eq!(model_info.model_name, "Meta-Llama-3-8B-Instruct");
assert_eq!(model_info.max_context_length, 8192);
assert_eq!(model_info.vocab_size, 128256);
assert!(model_info.supports_tool_calling);
assert!(!model_info.supports_vision);
assert_eq!(model_info.special_tokens.len(), 2);
assert_eq!(model_info.num_layers, 32);
assert_eq!(model_info.eos_token_ids, vec![128001, 128009]);
}
} }
...@@ -8,9 +8,6 @@ import "google/protobuf/struct.proto"; ...@@ -8,9 +8,6 @@ import "google/protobuf/struct.proto";
// Service definition for SGLang scheduler communication // Service definition for SGLang scheduler communication
// This protocol bridges the Rust router and Python scheduler // This protocol bridges the Rust router and Python scheduler
service SglangScheduler { service SglangScheduler {
// Initialize connection and get model info
rpc Initialize(InitializeRequest) returns (InitializeResponse);
// Submit a generation request (supports streaming) // Submit a generation request (supports streaming)
rpc Generate(GenerateRequest) returns (stream GenerateResponse); rpc Generate(GenerateRequest) returns (stream GenerateResponse);
...@@ -23,8 +20,6 @@ service SglangScheduler { ...@@ -23,8 +20,6 @@ service SglangScheduler {
// Abort a running request // Abort a running request
rpc Abort(AbortRequest) returns (AbortResponse); rpc Abort(AbortRequest) returns (AbortResponse);
// Flush KV cache
rpc FlushCache(FlushCacheRequest) returns (FlushCacheResponse);
} }
// ===================== // =====================
...@@ -75,14 +70,6 @@ message SamplingParams { ...@@ -75,14 +70,6 @@ message SamplingParams {
google.protobuf.Struct custom_params = 25; google.protobuf.Struct custom_params = 25;
} }
// Session parameters for continual prompting
message SessionParams {
string session_id = 1;
string request_id = 2;
int32 offset = 3;
bool replace = 4;
bool drop_previous_output = 5;
}
// Disaggregated serving parameters // Disaggregated serving parameters
message DisaggregatedParams { message DisaggregatedParams {
...@@ -91,87 +78,6 @@ message DisaggregatedParams { ...@@ -91,87 +78,6 @@ message DisaggregatedParams {
int32 bootstrap_room = 3; int32 bootstrap_room = 3;
} }
// =====================
// Initialize
// =====================
message InitializeRequest {
string client_id = 1;
string client_version = 2;
// Operating mode
enum Mode {
REGULAR = 0; // Normal mode with local scheduler
PREFILL = 1; // Prefill-only mode for disaggregated serving
DECODE = 2; // Decode-only mode for disaggregated serving
}
Mode mode = 3;
}
message InitializeResponse {
bool success = 1;
string scheduler_version = 2;
// Model information
ModelInfo model_info = 3;
// Server capabilities
ServerCapabilities capabilities = 4;
// Error message if success is false
string error_message = 5;
}
message ModelInfo {
string model_name = 1;
int32 max_context_length = 2;
int32 vocab_size = 3;
bool supports_tool_calling = 4;
bool supports_vision = 5;
repeated string special_tokens = 6;
// Additional model metadata
string model_type = 7;
int32 num_layers = 8;
int32 hidden_size = 9;
int32 num_attention_heads = 10;
int32 num_key_value_heads = 11;
// Tokenizer info
string tokenizer_type = 12;
repeated int32 eos_token_ids = 13;
int32 pad_token_id = 14;
int32 bos_token_id = 15;
}
message ServerCapabilities {
bool continuous_batching = 1;
bool disaggregated_serving = 2;
bool speculative_decoding = 3;
int32 max_batch_size = 4;
int32 max_num_batched_tokens = 5;
int32 max_prefill_tokens = 6;
string attention_backend = 7; // "flashinfer", "triton", "torch"
// Additional capabilities
bool supports_lora = 8;
bool supports_grammar = 9;
bool supports_multimodal = 10;
repeated string supported_modalities = 11; // ["image", "video", "audio"]
bool supports_custom_logit_processor = 12;
bool supports_session = 13;
// Hardware info
int32 num_gpus = 14;
string gpu_type = 15;
int64 total_gpu_memory = 16;
// Parallelism info
int32 tensor_parallel_size = 17;
int32 pipeline_parallel_size = 18;
int32 data_parallel_size = 19;
}
// ===================== // =====================
// Generate Request // Generate Request
// ===================== // =====================
...@@ -179,49 +85,43 @@ message ServerCapabilities { ...@@ -179,49 +85,43 @@ message ServerCapabilities {
message GenerateRequest { message GenerateRequest {
string request_id = 1; string request_id = 1;
// Input can be either text or tokenized // Input must be tokenized (no raw text)
oneof input { TokenizedInput tokenized = 2;
string text = 2;
TokenizedInput tokenized = 3;
}
// Multimodal inputs // Multimodal inputs
MultimodalInputs mm_inputs = 4; MultimodalInputs mm_inputs = 3;
// Generation parameters // Generation parameters
SamplingParams sampling_params = 5; SamplingParams sampling_params = 4;
// Return options // Return options
bool return_logprob = 6; bool return_logprob = 5;
int32 logprob_start_len = 7; int32 logprob_start_len = 6;
int32 top_logprobs_num = 8; int32 top_logprobs_num = 7;
repeated int32 token_ids_logprob = 9; repeated int32 token_ids_logprob = 8;
bool return_hidden_states = 10; bool return_hidden_states = 9;
// Session management
SessionParams session_params = 11;
// For disaggregated serving // For disaggregated serving
DisaggregatedParams disaggregated_params = 12; DisaggregatedParams disaggregated_params = 10;
// Custom logit processor (serialized) // Custom logit processor (serialized)
string custom_logit_processor = 13; string custom_logit_processor = 11;
// Request metadata // Request metadata
google.protobuf.Timestamp timestamp = 14; google.protobuf.Timestamp timestamp = 12;
bool log_metrics = 15; bool log_metrics = 13;
// Input embeddings (alternative to text/tokens) // Input embeddings (alternative to text/tokens)
repeated float input_embeds = 16; repeated float input_embeds = 14;
// LoRA adapter ID (if pre-loaded) // LoRA adapter ID (if pre-loaded)
string lora_id = 17; string lora_id = 15;
// Data parallel routing // Data parallel routing
int32 data_parallel_rank = 18; int32 data_parallel_rank = 16;
// For load balancing // For load balancing
int32 dp_balance_id = 19; int32 dp_balance_id = 17;
} }
message TokenizedInput { message TokenizedInput {
...@@ -303,19 +203,6 @@ message GenerateComplete { ...@@ -303,19 +203,6 @@ message GenerateComplete {
} }
FinishReason finish_reason = 3; FinishReason finish_reason = 3;
// Final counts
int32 prompt_tokens = 4;
int32 completion_tokens = 5;
int32 cached_tokens = 6;
// Performance metrics
float total_generation_time = 7;
float time_to_first_token = 8;
float tokens_per_second = 9;
// Spec decode metrics
int32 spec_verify_count = 10;
// All logprobs if requested // All logprobs if requested
repeated LogProbs all_logprobs = 11; repeated LogProbs all_logprobs = 11;
...@@ -359,10 +246,8 @@ message HiddenStates { ...@@ -359,10 +246,8 @@ message HiddenStates {
message EmbedRequest { message EmbedRequest {
string request_id = 1; string request_id = 1;
oneof input { // Input must be tokenized (no raw text)
string text = 2; TokenizedInput tokenized = 2;
TokenizedInput tokenized = 3;
}
// Multimodal inputs // Multimodal inputs
MultimodalInputs mm_inputs = 4; MultimodalInputs mm_inputs = 4;
...@@ -422,39 +307,13 @@ message EmbedError { ...@@ -422,39 +307,13 @@ message EmbedError {
// ===================== // =====================
message HealthCheckRequest { message HealthCheckRequest {
bool include_detailed_metrics = 1; // Input for health test generation (must be tokenized)
TokenizedInput tokenized = 1;
} }
message HealthCheckResponse { message HealthCheckResponse {
bool healthy = 1; bool healthy = 1;
string message = 2;
// Current load metrics
int32 num_requests_running = 2;
int32 num_requests_waiting = 3;
float gpu_cache_usage = 4;
float gpu_memory_usage = 5;
// KV cache metrics
int32 kv_cache_total_blocks = 6;
int32 kv_cache_used_blocks = 7;
float kv_cache_hit_rate = 8;
// Additional metrics
int32 num_grammar_queue_requests = 9;
float generation_throughput = 10; // tokens/sec
float average_queue_time = 11; // seconds
float average_generation_time = 12; // seconds
// System metrics
float cpu_usage = 13;
int64 memory_usage = 14;
// Disaggregation metrics
int32 num_prefill_requests = 15;
int32 num_decode_requests = 16;
// Detailed metrics (optional)
google.protobuf.Struct detailed_metrics = 17;
} }
message AbortRequest { message AbortRequest {
...@@ -467,17 +326,6 @@ message AbortResponse { ...@@ -467,17 +326,6 @@ message AbortResponse {
string message = 2; string message = 2;
} }
message FlushCacheRequest {
bool flush_all = 1;
repeated string session_ids = 2; // Flush specific sessions
}
message FlushCacheResponse {
bool success = 1;
int32 num_entries_flushed = 2;
int64 memory_freed = 3; // bytes
string message = 4;
}
// ===================== // =====================
// Additional Operations (Future) // Additional Operations (Future)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment