syntax = "proto3"; package sglang.grpc.scheduler; import "google/protobuf/timestamp.proto"; import "google/protobuf/struct.proto"; // Service definition for SGLang scheduler communication // This protocol bridges the Rust router and Python scheduler service SglangScheduler { // Submit a generation request (supports streaming) rpc Generate(GenerateRequest) returns (stream GenerateResponse); // Submit an embedding request rpc Embed(EmbedRequest) returns (EmbedResponse); // Health check and metrics rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse); // Abort a running request rpc Abort(AbortRequest) returns (AbortResponse); // Get model information rpc GetModelInfo(GetModelInfoRequest) returns (GetModelInfoResponse); // Get server information rpc GetServerInfo(GetServerInfoRequest) returns (GetServerInfoResponse); } // ===================== // Common Types // ===================== // Sampling parameters matching SGLang's SamplingParams // // IMPORTANT: Do not use SamplingParams::default() directly! // The proto3 defaults (0 for numeric fields) do NOT match the semantic defaults // (temperature=1.0, top_p=1.0, top_k=-1, etc.). Always construct with explicit values // or use the conversion functions in sglang_scheduler.rs / grpc_server.py. message SamplingParams { float temperature = 1; float top_p = 2; int32 top_k = 3; float min_p = 4; float frequency_penalty = 5; float presence_penalty = 6; float repetition_penalty = 7; optional int32 max_new_tokens = 8; repeated string stop = 9; repeated uint32 stop_token_ids = 10; bool skip_special_tokens = 11; bool spaces_between_special_tokens = 12; // Structured generation oneof constraint { string regex = 13; string json_schema = 14; string ebnf_grammar = 15; string structural_tag = 16; } // Speculative decoding int32 n = 17; // Number of samples // Additional parameters int32 min_new_tokens = 18; bool ignore_eos = 19; bool no_stop_trim = 20; optional int32 stream_interval = 21; map logit_bias = 22; // Custom parameters for extensibility google.protobuf.Struct custom_params = 23; } // Disaggregated serving parameters message DisaggregatedParams { string bootstrap_host = 1; int32 bootstrap_port = 2; int32 bootstrap_room = 3; } // ===================== // Generate Request // ===================== message GenerateRequest { string request_id = 1; // Input must be tokenized (no raw text) TokenizedInput tokenized = 2; // Multimodal inputs MultimodalInputs mm_inputs = 3; // Generation parameters SamplingParams sampling_params = 4; // Return options bool return_logprob = 5; int32 logprob_start_len = 6; int32 top_logprobs_num = 7; repeated uint32 token_ids_logprob = 8; bool return_hidden_states = 9; // For disaggregated serving DisaggregatedParams disaggregated_params = 10; // Custom logit processor (serialized) string custom_logit_processor = 11; // Request metadata google.protobuf.Timestamp timestamp = 12; bool log_metrics = 13; // Input embeddings (alternative to text/tokens) repeated float input_embeds = 14; // LoRA adapter ID (if pre-loaded) string lora_id = 15; // Data parallel routing int32 data_parallel_rank = 16; // Whether client wants streaming response bool stream = 17; } message TokenizedInput { string original_text = 1; // For reference repeated uint32 input_ids = 2; } message MultimodalInputs { // Simplified multimodal handling - actual data processed by tokenizer repeated string image_urls = 1; repeated string video_urls = 2; repeated string audio_urls = 3; // Pre-processed multimodal features (if available) google.protobuf.Struct 
message MultimodalInputs {
  // Simplified multimodal handling - actual data processed by tokenizer
  repeated string image_urls = 1;
  repeated string video_urls = 2;
  repeated string audio_urls = 3;

  // Pre-processed multimodal features (if available)
  google.protobuf.Struct processed_features = 4;

  // Raw data for direct processing
  repeated bytes image_data = 5;
  repeated bytes video_data = 6;
  repeated bytes audio_data = 7;

  // Modality metadata
  repeated string modalities = 8;
}

// =====================
// Generate Response
// =====================

message GenerateResponse {
  string request_id = 1;

  // Response type
  oneof response {
    GenerateStreamChunk chunk = 2;
    GenerateComplete complete = 3;
    GenerateError error = 4;
  }
}

message GenerateStreamChunk {
  // Generated tokens (incremental chunk)
  repeated uint32 token_ids = 1;

  // Cumulative counts
  int32 prompt_tokens = 2;
  int32 completion_tokens = 3;
  int32 cached_tokens = 4;

  // Output logprobs (if requested) - incremental for streaming
  OutputLogProbs output_logprobs = 5;

  // Hidden states (if requested)
  repeated float hidden_states = 6;

  // Input logprobs (if requested) - only in the first chunk
  InputLogProbs input_logprobs = 7;

  // Index for ordering when n>1 (for parallel request multiplexing)
  uint32 index = 8;
}

message GenerateComplete {
  // Final output
  repeated uint32 output_ids = 1;

  // Finish reason as an OpenAI-compatible string ("stop", "length", "abort")
  string finish_reason = 2;

  // Token usage counts
  int32 prompt_tokens = 3;
  int32 completion_tokens = 4;
  int32 cached_tokens = 5;

  // Output logprobs if requested (cumulative)
  OutputLogProbs output_logprobs = 6;

  // All hidden states if requested
  repeated HiddenStates all_hidden_states = 7;

  // Matched stop information (for stop sequences)
  oneof matched_stop {
    uint32 matched_token_id = 8;
    string matched_stop_str = 9;
  }

  // Input logprobs if requested (for prompt tokens)
  InputLogProbs input_logprobs = 10;

  // Index for ordering when n>1 (for parallel request multiplexing)
  uint32 index = 11;
}

message GenerateError {
  string message = 1;
  string http_status_code = 2;
  string details = 3;
}

// Output logprobs - all values are present (no None)
message OutputLogProbs {
  repeated float token_logprobs = 1;
  repeated int32 token_ids = 2;

  // Top logprobs at each position
  repeated TopLogProbs top_logprobs = 3;
}

// Input logprobs - the first token has no logprob (None)
message InputLogProbs {
  repeated InputTokenLogProb token_logprobs = 1;
  repeated int32 token_ids = 2;

  // Top logprobs at each position
  repeated TopLogProbs top_logprobs = 3;
}

// Wrapper to represent an optional logprob (the first input token has none)
message InputTokenLogProb {
  optional float value = 1;
}

message TopLogProbs {
  repeated float values = 1;
  repeated int32 token_ids = 2;
}

message HiddenStates {
  repeated float values = 1;
  int32 layer = 2;
  int32 position = 3;
}
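// Example (illustrative): consuming the Generate stream and dispatching on
// the response oneof above. A minimal Python sketch, given a GenerateRequest
// like the one sketched earlier; the generated module names
// (sglang_scheduler_pb2 / sglang_scheduler_pb2_grpc) and the server address
// are assumptions.
//
//   import grpc
//   import sglang_scheduler_pb2 as pb
//   import sglang_scheduler_pb2_grpc as pb_grpc
//
//   channel = grpc.insecure_channel("localhost:30000")  # assumed address
//   stub = pb_grpc.SglangSchedulerStub(channel)
//   for resp in stub.Generate(request):    # server-streaming RPC
//       kind = resp.WhichOneof("response")
//       if kind == "chunk":
//           print("tokens:", list(resp.chunk.token_ids))
//       elif kind == "complete":
//           print("finish_reason:", resp.complete.finish_reason)
//       elif kind == "error":
//           raise RuntimeError(resp.error.message)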
// =====================
// Embedding Request
// =====================

message EmbedRequest {
  string request_id = 1;

  // Input must be tokenized (no raw text)
  TokenizedInput tokenized = 2;

  // Multimodal inputs
  MultimodalInputs mm_inputs = 4;

  // Dummy sampling params for compatibility;
  // EmbedRequest does not use sampling_params
  SamplingParams sampling_params = 5;

  bool log_metrics = 6;

  // Token type IDs for models that require them
  repeated int32 token_type_ids = 7;

  // Data parallel routing
  int32 data_parallel_rank = 8;

  // For cross-encoder requests
  bool is_cross_encoder = 9;
  repeated string texts = 10;  // For cross-encoder batch
}

message EmbedResponse {
  string request_id = 1;

  oneof response {
    EmbedComplete complete = 2;
    EmbedError error = 3;
  }
}

message EmbedComplete {
  repeated float embedding = 1;
  int32 prompt_tokens = 2;
  int32 cached_tokens = 3;

  // Additional metadata
  int32 embedding_dim = 4;

  // For batch embeddings
  repeated Embedding batch_embeddings = 5;
}

message Embedding {
  repeated float values = 1;
  int32 index = 2;
}

message EmbedError {
  string message = 1;
  string code = 2;
  string details = 3;
}

// =====================
// Management Operations
// =====================

message HealthCheckRequest {
  // Input for health-test generation (must be tokenized)
  TokenizedInput tokenized = 1;
}

message HealthCheckResponse {
  bool healthy = 1;
  string message = 2;
}

message AbortRequest {
  string request_id = 1;
  string reason = 2;
}

message AbortResponse {
  bool success = 1;
  string message = 2;
}

// =====================
// Additional Operations (Future)
// =====================

// Load LoRA adapter
message LoadLoRARequest {
  string adapter_id = 1;
  string adapter_path = 2;
  int32 rank = 3;
}

message LoadLoRAResponse {
  bool success = 1;
  string adapter_id = 2;
  string message = 3;
}

// Unload LoRA adapter
message UnloadLoRARequest {
  string adapter_id = 1;
}

message UnloadLoRAResponse {
  bool success = 1;
  string message = 2;
}

// Update weights
message UpdateWeightsRequest {
  oneof source {
    string disk_path = 1;
    bytes tensor_data = 2;
    string remote_url = 3;
  }
  string weight_name = 4;
}

message UpdateWeightsResponse {
  bool success = 1;
  string message = 2;
}

// Get internal state for debugging
message GetInternalStateRequest {
  repeated string state_keys = 1;
}

message GetInternalStateResponse {
  google.protobuf.Struct state = 1;
}

// Set internal state for testing
message SetInternalStateRequest {
  google.protobuf.Struct state = 1;
}

message SetInternalStateResponse {
  bool success = 1;
  string message = 2;
}

// =====================
// Model and Server Info
// =====================

// Get model information
message GetModelInfoRequest {}

message GetModelInfoResponse {
  string model_path = 1;
  string tokenizer_path = 2;
  bool is_generation = 3;
  string preferred_sampling_params = 4;  // JSON string, or empty
  string weight_version = 5;
  string served_model_name = 6;
  int32 max_context_length = 7;
  int32 vocab_size = 8;
  bool supports_vision = 9;
  string model_type = 10;
  repeated int32 eos_token_ids = 11;
  int32 pad_token_id = 12;
  int32 bos_token_id = 13;
  int32 max_req_input_len = 14;
}

// Get server information
message GetServerInfoRequest {}

message GetServerInfoResponse {
  // Server configuration (as structured data)
  google.protobuf.Struct server_args = 1;

  // Scheduler metrics (from scheduler initialization)
  google.protobuf.Struct scheduler_info = 2;

  // Runtime state
  int32 active_requests = 3;
  bool is_paused = 4;
  double last_receive_timestamp = 5;
  double uptime_seconds = 6;

  // Version info
  string sglang_version = 7;

  // Server metadata
  string server_type = 8;  // "grpc"
  google.protobuf.Timestamp start_time = 9;

  // Note: internal_states are not provided in gRPC mode.
  // Scheduler-side metrics (memory usage, throughput) require the
  // bidirectional communicator infrastructure, which is not available
  // over gRPC. Use HTTP /get_server_info if scheduler internal state
  // is needed.
}
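// Example (illustrative): probing a scheduler via the management and info
// RPCs above. A minimal Python sketch; the module names, server address,
// and probe token IDs are assumptions.
//
//   import grpc
//   import sglang_scheduler_pb2 as pb
//   import sglang_scheduler_pb2_grpc as pb_grpc
//
//   channel = grpc.insecure_channel("localhost:30000")  # assumed address
//   stub = pb_grpc.SglangSchedulerStub(channel)
//
//   # HealthCheck runs a tiny tokenized generation through the scheduler.
//   health = stub.HealthCheck(pb.HealthCheckRequest(
//       tokenized=pb.TokenizedInput(original_text="ping", input_ids=[1])))
//   print(health.healthy, health.message)
//
//   info = stub.GetModelInfo(pb.GetModelInfoRequest())
//   print(info.served_model_name, info.max_context_length)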