syntax = "proto3";

package sglang.grpc.scheduler;

import "google/protobuf/timestamp.proto";
import "google/protobuf/struct.proto";

// Service definition for SGLang scheduler communication
// This protocol bridges the Rust router and Python scheduler
service SglangScheduler {
  // Submit a generation request (supports streaming)
  rpc Generate(GenerateRequest) returns (stream GenerateResponse);

  // Submit an embedding request
  rpc Embed(EmbedRequest) returns (EmbedResponse);

  // Health check and metrics
  rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse);

  // Abort a running request
  rpc Abort(AbortRequest) returns (AbortResponse);

  // Get model information
  rpc GetModelInfo(GetModelInfoRequest) returns (GetModelInfoResponse);

  // Get server information
  rpc GetServerInfo(GetServerInfoRequest) returns (GetServerInfoResponse);
}

// =====================
// Common Types
// =====================

// Sampling parameters matching SGLang's SamplingParams
//
// IMPORTANT: Do not use SamplingParams::default() directly!
// The proto3 defaults (0 for numeric fields) do NOT match the semantic defaults
// (temperature=1.0, top_p=1.0, top_k=-1, etc.). Always construct with explicit values
// or use the conversion functions in sglang_scheduler.rs / grpc_server.py.
message SamplingParams {
  float temperature = 1;
  float top_p = 2;
  int32 top_k = 3;
  float min_p = 4;
  float frequency_penalty = 5;
  float presence_penalty = 6;
  float repetition_penalty = 7;

  // `optional` gives explicit presence: unset means "use the server default".
  optional int32 max_new_tokens = 8;
  repeated string stop = 9;
  repeated uint32 stop_token_ids = 10;

  bool skip_special_tokens = 11;
  bool spaces_between_special_tokens = 12;

  // Structured generation — at most one constraint may be active.
  oneof constraint {
    string regex = 13;
    string json_schema = 14;
    string ebnf_grammar = 15;
    string structural_tag = 16;
  }

  // Speculative decoding
  int32 n = 17;  // Number of samples

  // Additional parameters
  int32 min_new_tokens = 18;
  bool ignore_eos = 19;
  bool no_stop_trim = 20;
  optional int32 stream_interval = 21;
  map<string, float> logit_bias = 22;

  // Custom parameters for extensibility
  google.protobuf.Struct custom_params = 23;
}


// Disaggregated serving parameters
// NOTE(review): presumably used to pair prefill and decode workers for
// KV-cache transfer — verify against the scheduler-side consumer.
message DisaggregatedParams {
  // Host used to bootstrap the peer connection.
  string bootstrap_host = 1;
  // Port of the bootstrap endpoint.
  int32 bootstrap_port = 2;
  // Session/room identifier for the bootstrap handshake.
  int32 bootstrap_room = 3;
}

// =====================
// Generate Request
// =====================

message GenerateRequest {
  string request_id = 1;

  // Input must be tokenized (no raw text)
  TokenizedInput tokenized = 2;

  // Multimodal inputs
  MultimodalInputs mm_inputs = 3;

  // Generation parameters
  SamplingParams sampling_params = 4;

  // Return options
  bool return_logprob = 5;
  int32 logprob_start_len = 6;
  int32 top_logprobs_num = 7;
  repeated uint32 token_ids_logprob = 8;
  bool return_hidden_states = 9;

  // For disaggregated serving
  DisaggregatedParams disaggregated_params = 10;

  // Custom logit processor (serialized)
  string custom_logit_processor = 11;

  // Request metadata
  google.protobuf.Timestamp timestamp = 12;
  bool log_metrics = 13;

  // Input embeddings (alternative to text/tokens)
  repeated float input_embeds = 14;

  // LoRA adapter ID (if pre-loaded)
  string lora_id = 15;

  // Data parallel routing
  int32 data_parallel_rank = 16;

  // Whether client wants streaming response
  bool stream = 17;
}

// Pre-tokenized input: the token IDs the scheduler consumes, plus the
// original text retained for reference/debugging only.
message TokenizedInput {
  string original_text = 1;  // For reference
  repeated uint32 input_ids = 2;
}

// Multimodal inputs attached to a generate/embed request.
message MultimodalInputs {
  // Simplified multimodal handling - actual data processed by tokenizer
  repeated string image_urls = 1;
  repeated string video_urls = 2;
  repeated string audio_urls = 3;

  // Pre-processed multimodal features (if available)
  google.protobuf.Struct processed_features = 4;

  // Raw data for direct processing
  repeated bytes image_data = 5;
  repeated bytes video_data = 6;
  repeated bytes audio_data = 7;

  // Modality metadata
  // NOTE(review): entry format/ordering relative to the fields above is not
  // shown here — verify against the consumer before relying on it.
  repeated string modalities = 8;
}

// =====================
// Generate Response
// =====================

// Envelope for the Generate streaming RPC: each message carries exactly one
// of an incremental chunk, a final completion, or a terminal error.
message GenerateResponse {
  // Echoes GenerateRequest.request_id so clients can correlate responses.
  string request_id = 1;

  // Response type
  oneof response {
    GenerateStreamChunk chunk = 2;
    GenerateComplete complete = 3;
    GenerateError error = 4;
  }
}

message GenerateStreamChunk {
  // Generated tokens (incremental chunk)
  repeated uint32 token_ids = 1;

  // Cumulative counts
  int32 prompt_tokens = 2;
  int32 completion_tokens = 3;
  int32 cached_tokens = 4;

  // Output logprobs (if requested) - incremental for streaming
  OutputLogProbs output_logprobs = 5;

  // Hidden states (if requested)
  repeated float hidden_states = 6;

  // Input logprobs (if requested) - only in first chunk
  InputLogProbs input_logprobs = 7;

  // Index for ordering when n>1 (for parallel request multiplexing)
  uint32 index = 8;
}

message GenerateComplete {
  // Final output
  repeated uint32 output_ids = 1;

  // Finish reason as OpenAI-compatible string ("stop", "length", "abort")
  string finish_reason = 2;

  // Token usage counts
  int32 prompt_tokens = 3;
  int32 completion_tokens = 4;
  int32 cached_tokens = 5;

  // Output logprobs if requested (cumulative)
  OutputLogProbs output_logprobs = 6;

  // All hidden states if requested
  repeated HiddenStates all_hidden_states = 7;

  // Matched stop information (for stop sequences)
  oneof matched_stop {
    uint32 matched_token_id = 8;
    string matched_stop_str = 9;
  }

  // Input logprobs if requested (for prompt tokens)
  // (field number 10 continues after the oneof members 8-9)
  InputLogProbs input_logprobs = 10;

  // Index for ordering when n>1 (for parallel request multiplexing)
  uint32 index = 11;
}

// Terminal error for a generate request.
message GenerateError {
  // Human-readable error description.
  string message = 1;
  // HTTP status code carried as a string.
  // NOTE(review): exact format is not shown here — verify the producer.
  string http_status_code = 2;
  // Additional diagnostic detail.
  string details = 3;
}

// Output logprobs - all values are present (no None)
// token_logprobs, token_ids and top_logprobs are parallel arrays indexed by
// output position.
message OutputLogProbs {
  repeated float token_logprobs = 1;
  repeated int32 token_ids = 2;

  // Top logprobs at each position
  repeated TopLogProbs top_logprobs = 3;
}

// Input logprobs - first token has no logprob (None)
// token_logprobs, token_ids and top_logprobs are parallel arrays indexed by
// prompt position; the wrapper type encodes the missing first-token logprob.
message InputLogProbs {
  repeated InputTokenLogProb token_logprobs = 1;
  repeated int32 token_ids = 2;

  // Top logprobs at each position
  repeated TopLogProbs top_logprobs = 3;
}

// Wrapper to represent optional logprob (first input token has no logprob)
message InputTokenLogProb {
  // Unset for the first input token; present for all others.
  optional float value = 1;
}

// Top-k logprob candidates at a single position.
// values and token_ids are parallel arrays of the same length.
message TopLogProbs {
  repeated float values = 1;
  repeated int32 token_ids = 2;
}

// One hidden-state vector together with its layer index and token position.
message HiddenStates {
  repeated float values = 1;
  int32 layer = 2;
  int32 position = 3;
}

// =====================
// Embedding Request
// =====================

message EmbedRequest {
  string request_id = 1;

  // Input must be tokenized (no raw text)
  TokenizedInput tokenized = 2;

  // NOTE(review): field number 3 is skipped in this message. Do not reuse it
  // without confirming its history; consider adding `reserved 3;`.

  // Multimodal inputs
  MultimodalInputs mm_inputs = 4;

  // Dummy sampling params for compatibility
  // EmbedRequest doesn't use sampling_params
  SamplingParams sampling_params = 5;

  bool log_metrics = 6;

  // Token type IDs for models that require them
  repeated int32 token_type_ids = 7;

  // Data parallel routing
  int32 data_parallel_rank = 8;

  // For cross-encoder requests
  bool is_cross_encoder = 9;
  repeated string texts = 10;  // For cross-encoder batch
}

// Envelope for the Embed RPC: carries either a completed result or an error.
message EmbedResponse {
  // Echoes EmbedRequest.request_id so clients can correlate responses.
  string request_id = 1;

  oneof response {
    EmbedComplete complete = 2;
    EmbedError error = 3;
  }
}

message EmbedComplete {
  // Embedding vector for a single input.
  repeated float embedding = 1;
  int32 prompt_tokens = 2;
  int32 cached_tokens = 3;

  // Additional metadata
  int32 embedding_dim = 4;

  // For batch embeddings
  repeated Embedding batch_embeddings = 5;
}

// One embedding vector in a batch, with its position within the batch.
message Embedding {
  repeated float values = 1;
  int32 index = 2;
}

// Terminal error for an embedding request.
message EmbedError {
  string message = 1;
  string code = 2;
  string details = 3;
}

// =====================
// Management Operations
// =====================

message HealthCheckRequest {
  // Input for health test generation (must be tokenized)
  TokenizedInput tokenized = 1;
}

message HealthCheckResponse {
  // True when the scheduler served the health generation successfully.
  bool healthy = 1;
  // Human-readable status detail.
  string message = 2;
}

// Request to abort an in-flight generation by its request_id.
message AbortRequest {
  string request_id = 1;
  // Optional human-readable reason for the abort.
  string reason = 2;
}

// Result of an abort attempt.
message AbortResponse {
  bool success = 1;
  string message = 2;
}


// =====================
// Additional Operations (Future)
// =====================

// Load LoRA adapter
message LoadLoRARequest {
  string adapter_id = 1;
  string adapter_path = 2;
  // NOTE(review): presumably the LoRA matrix rank — verify before use.
  int32 rank = 3;
}

message LoadLoRAResponse {
  bool success = 1;
  string adapter_id = 2;
  string message = 3;
}

// Unload LoRA adapter
message UnloadLoRARequest {
  string adapter_id = 1;
}

message UnloadLoRAResponse {
  bool success = 1;
  string message = 2;
}

// Update weights
message UpdateWeightsRequest {
  // Exactly one weight source may be set.
  oneof source {
    string disk_path = 1;
    bytes tensor_data = 2;
    string remote_url = 3;
  }
  string weight_name = 4;
}

message UpdateWeightsResponse {
  bool success = 1;
  string message = 2;
}

// Get internal state for debugging
message GetInternalStateRequest {
  // Keys to fetch. NOTE(review): behavior when empty (all vs. none) is not
  // shown here — verify against the server implementation.
  repeated string state_keys = 1;
}

message GetInternalStateResponse {
  google.protobuf.Struct state = 1;
}

// Set internal state for testing
message SetInternalStateRequest {
  google.protobuf.Struct state = 1;
}

message SetInternalStateResponse {
  bool success = 1;
  string message = 2;
}

// =====================
// Model and Server Info
// =====================

// Get model information
message GetModelInfoRequest {}

message GetModelInfoResponse {
  string model_path = 1;
  string tokenizer_path = 2;
  // NOTE(review): presumably distinguishes generation models from
  // embedding-only models (the service exposes both RPCs) — verify.
  bool is_generation = 3;
  string preferred_sampling_params = 4;  // JSON string or empty
  string weight_version = 5;
  string served_model_name = 6;
  int32 max_context_length = 7;
  int32 vocab_size = 8;
  bool supports_vision = 9;
  string model_type = 10;
  repeated int32 eos_token_ids = 11;
  int32 pad_token_id = 12;
  int32 bos_token_id = 13;
  int32 max_req_input_len = 14;
}

// Get server information
message GetServerInfoRequest {}

message GetServerInfoResponse {
  // Server configuration (as structured data)
  google.protobuf.Struct server_args = 1;

  // Scheduler metrics (from scheduler initialization)
  google.protobuf.Struct scheduler_info = 2;

  // Runtime state
  int32 active_requests = 3;
  bool is_paused = 4;
  // NOTE(review): units/epoch of this double timestamp are not shown here —
  // verify the producer before interpreting.
  double last_receive_timestamp = 5;
  double uptime_seconds = 6;

  // Version info
  string sglang_version = 7;

  // Server metadata
  string server_type = 8;  // "grpc"
  google.protobuf.Timestamp start_time = 9;

  // Note: internal_states not provided in gRPC mode
  // Scheduler-side metrics (memory usage, throughput) require
  // bidirectional communicator infrastructure not available in gRPC.
  // Use HTTP /get_server_info if scheduler internal state is needed.
}