syntax = "proto3";

package sglang.grpc.scheduler;

import "google/protobuf/timestamp.proto";
import "google/protobuf/struct.proto";

// Service definition for SGLang scheduler communication
// This protocol bridges the Rust router and Python scheduler
service SglangScheduler {
  // Submit a generation request (supports streaming)
  rpc Generate(GenerateRequest) returns (stream GenerateResponse);

  // Submit an embedding request
  rpc Embed(EmbedRequest) returns (EmbedResponse);

  // Health check and metrics
  rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse);

  // Abort a running request
  rpc Abort(AbortRequest) returns (AbortResponse);

  // Get model information
  rpc GetModelInfo(GetModelInfoRequest) returns (GetModelInfoResponse);

  // Get server information
  rpc GetServerInfo(GetServerInfoRequest) returns (GetServerInfoResponse);
}

// =====================
// Common Types
// =====================

// Sampling parameters matching SGLang's SamplingParams
//
// IMPORTANT: Do not use SamplingParams::default() directly!
// The proto3 defaults (0 for numeric fields) do NOT match the semantic defaults
// (temperature=1.0, top_p=1.0, top_k=-1, etc.). Always construct with explicit values
// or use the conversion functions in sglang_scheduler.rs / grpc_server.py.
message SamplingParams {
  float temperature = 1;
  float top_p = 2;
  int32 top_k = 3;
  float min_p = 4;
  float frequency_penalty = 5;
  float presence_penalty = 6;
  float repetition_penalty = 7;

  // `optional` gives explicit presence: unset means "use the server default".
  optional int32 max_new_tokens = 8;
  repeated string stop = 9;
  repeated uint32 stop_token_ids = 10;

  bool skip_special_tokens = 11;
  bool spaces_between_special_tokens = 12;

  // Structured generation — at most one constraint may be active.
  oneof constraint {
    string regex = 13;
    string json_schema = 14;
    string ebnf_grammar = 15;
    string structural_tag = 16;
  }

  // Speculative decoding
  int32 n = 17;  // Number of samples

  // Additional parameters
  int32 min_new_tokens = 18;
  bool ignore_eos = 19;
  bool no_stop_trim = 20;
  optional int32 stream_interval = 21;
  map<string, float> logit_bias = 22;

  // Custom parameters for extensibility
  google.protobuf.Struct custom_params = 23;
}


// Disaggregated serving parameters
// NOTE(review): presumably used to pair prefill and decode workers for
// KV-cache transfer — verify against the scheduler-side consumer.
message DisaggregatedParams {
  // Host used to bootstrap the peer connection.
  string bootstrap_host = 1;
  // Port of the bootstrap endpoint.
  int32 bootstrap_port = 2;
  // Session/room identifier for the bootstrap handshake.
  int32 bootstrap_room = 3;
}

// =====================
// Generate Request
// =====================

message GenerateRequest {
  string request_id = 1;

  // Input must be tokenized (no raw text)
  TokenizedInput tokenized = 2;

  // Multimodal inputs
  MultimodalInputs mm_inputs = 3;

  // Generation parameters
  SamplingParams sampling_params = 4;

  // Return options
  bool return_logprob = 5;
  int32 logprob_start_len = 6;
  int32 top_logprobs_num = 7;
  repeated uint32 token_ids_logprob = 8;
  bool return_hidden_states = 9;

  // For disaggregated serving
  DisaggregatedParams disaggregated_params = 10;

  // Custom logit processor (serialized)
  string custom_logit_processor = 11;

  // Request metadata
  google.protobuf.Timestamp timestamp = 12;
  bool log_metrics = 13;

  // Input embeddings (alternative to text/tokens)
  repeated float input_embeds = 14;

  // LoRA adapter ID (if pre-loaded)
  string lora_id = 15;

  // Data parallel routing
  int32 data_parallel_rank = 16;

  // Whether client wants streaming response
  bool stream = 17;
}

// Pre-tokenized input: the token IDs the scheduler consumes, plus the
// original text retained for reference/debugging only.
message TokenizedInput {
  string original_text = 1;  // For reference
  repeated uint32 input_ids = 2;
}

// Multimodal inputs attached to a generate/embed request.
message MultimodalInputs {
  // Simplified multimodal handling - actual data processed by tokenizer
  repeated string image_urls = 1;
  repeated string video_urls = 2;
  repeated string audio_urls = 3;

  // Pre-processed multimodal features (if available)
  google.protobuf.Struct processed_features = 4;

  // Raw data for direct processing
  repeated bytes image_data = 5;
  repeated bytes video_data = 6;
  repeated bytes audio_data = 7;

  // Modality metadata
  // NOTE(review): entry format/ordering relative to the fields above is not
  // shown here — verify against the consumer before relying on it.
  repeated string modalities = 8;
}

// =====================
// Generate Response
// =====================

// Envelope for the Generate streaming RPC: each message carries exactly one
// of an incremental chunk, a final completion, or a terminal error.
message GenerateResponse {
  // Echoes GenerateRequest.request_id so clients can correlate responses.
  string request_id = 1;

  // Response type
  oneof response {
    GenerateStreamChunk chunk = 2;
    GenerateComplete complete = 3;
    GenerateError error = 4;
  }
}

message GenerateStreamChunk {
  // Generated tokens (incremental chunk)
  repeated uint32 token_ids = 1;

  // Cumulative counts
  int32 prompt_tokens = 2;
  int32 completion_tokens = 3;
  int32 cached_tokens = 4;

  // Output logprobs (if requested) - incremental for streaming
  OutputLogProbs output_logprobs = 5;

  // Hidden states (if requested)
  repeated float hidden_states = 6;

  // Input logprobs (if requested) - only in first chunk
  InputLogProbs input_logprobs = 7;

  // Index for ordering when n>1 (for parallel request multiplexing)
  uint32 index = 8;
}

message GenerateComplete {
  // Final output
  repeated uint32 output_ids = 1;

  // Finish reason as OpenAI-compatible string ("stop", "length", "abort")
  string finish_reason = 2;

  // Token usage counts
  int32 prompt_tokens = 3;
  int32 completion_tokens = 4;
  int32 cached_tokens = 5;

  // Output logprobs if requested (cumulative)
  OutputLogProbs output_logprobs = 6;

  // All hidden states if requested
  repeated HiddenStates all_hidden_states = 7;

  // Matched stop information (for stop sequences)
  oneof matched_stop {
    uint32 matched_token_id = 8;
    string matched_stop_str = 9;
  }

  // Input logprobs if requested (for prompt tokens)
  // (field number 10 continues after the oneof members 8-9)
  InputLogProbs input_logprobs = 10;

  // Index for ordering when n>1 (for parallel request multiplexing)
  uint32 index = 11;
}

// Terminal error for a generate request.
message GenerateError {
  // Human-readable error description.
  string message = 1;
  // HTTP status code carried as a string.
  // NOTE(review): exact format is not shown here — verify the producer.
  string http_status_code = 2;
  // Additional diagnostic detail.
  string details = 3;
}

// Output logprobs - all values are present (no None)
// token_logprobs, token_ids and top_logprobs are parallel arrays indexed by
// output position.
message OutputLogProbs {
  repeated float token_logprobs = 1;
  repeated int32 token_ids = 2;

  // Top logprobs at each position
  repeated TopLogProbs top_logprobs = 3;
}

// Input logprobs - first token has no logprob (None)
// token_logprobs, token_ids and top_logprobs are parallel arrays indexed by
// prompt position; the wrapper type encodes the missing first-token logprob.
message InputLogProbs {
  repeated InputTokenLogProb token_logprobs = 1;
  repeated int32 token_ids = 2;

  // Top logprobs at each position
  repeated TopLogProbs top_logprobs = 3;
}

// Wrapper to represent optional logprob (first input token has no logprob)
message InputTokenLogProb {
  // Unset for the first input token; present for all others.
  optional float value = 1;
}

// Top-k logprob candidates at a single position.
// values and token_ids are parallel arrays of the same length.
message TopLogProbs {
  repeated float values = 1;
  repeated int32 token_ids = 2;
}

// One hidden-state vector together with its layer index and token position.
message HiddenStates {
  repeated float values = 1;
  int32 layer = 2;
  int32 position = 3;
}

// =====================
// Embedding Request
// =====================

message EmbedRequest {
  string request_id = 1;

  // Input must be tokenized (no raw text)
  TokenizedInput tokenized = 2;

  // NOTE(review): field number 3 is skipped in this message. Do not reuse it
  // without confirming its history; consider adding `reserved 3;`.

  // Multimodal inputs
  MultimodalInputs mm_inputs = 4;

  // Dummy sampling params for compatibility
  // EmbedRequest doesn't use sampling_params
  SamplingParams sampling_params = 5;

  bool log_metrics = 6;

  // Token type IDs for models that require them
  repeated int32 token_type_ids = 7;

  // Data parallel routing
  int32 data_parallel_rank = 8;

  // For cross-encoder requests
  bool is_cross_encoder = 9;
  repeated string texts = 10;  // For cross-encoder batch
}

// Envelope for the Embed RPC: carries either a completed result or an error.
message EmbedResponse {
  // Echoes EmbedRequest.request_id so clients can correlate responses.
  string request_id = 1;

  oneof response {
    EmbedComplete complete = 2;
    EmbedError error = 3;
  }
}

message EmbedComplete {
  // Embedding vector for a single input.
  repeated float embedding = 1;
  int32 prompt_tokens = 2;
  int32 cached_tokens = 3;

  // Additional metadata
  int32 embedding_dim = 4;

  // For batch embeddings
  repeated Embedding batch_embeddings = 5;
}

// One embedding vector in a batch, with its position within the batch.
message Embedding {
  repeated float values = 1;
  int32 index = 2;
}

// Terminal error for an embedding request.
message EmbedError {
  string message = 1;
  string code = 2;
  string details = 3;
}

// =====================
// Management Operations
// =====================

message HealthCheckRequest {
  // Input for health test generation (must be tokenized)
  TokenizedInput tokenized = 1;
}

message HealthCheckResponse {
  // True when the scheduler served the health generation successfully.
  bool healthy = 1;
  // Human-readable status detail.
  string message = 2;
}

// Request to abort an in-flight generation by its request_id.
message AbortRequest {
  string request_id = 1;
  // Optional human-readable reason for the abort.
  string reason = 2;
}

// Result of an abort attempt.
message AbortResponse {
  bool success = 1;
  string message = 2;
}


// =====================
// Additional Operations (Future)
// =====================

// Load LoRA adapter
message LoadLoRARequest {
  string adapter_id = 1;
  string adapter_path = 2;
  // NOTE(review): presumably the LoRA matrix rank — verify before use.
  int32 rank = 3;
}

message LoadLoRAResponse {
  bool success = 1;
  string adapter_id = 2;
  string message = 3;
}

// Unload LoRA adapter
message UnloadLoRARequest {
  string adapter_id = 1;
}

message UnloadLoRAResponse {
  bool success = 1;
  string message = 2;
}

// Update weights
message UpdateWeightsRequest {
  // Exactly one weight source may be set.
  oneof source {
    string disk_path = 1;
    bytes tensor_data = 2;
    string remote_url = 3;
  }
  string weight_name = 4;
}

message UpdateWeightsResponse {
  bool success = 1;
  string message = 2;
}

// Get internal state for debugging
message GetInternalStateRequest {
  // Keys to fetch. NOTE(review): behavior when empty (all vs. none) is not
  // shown here — verify against the server implementation.
  repeated string state_keys = 1;
}

message GetInternalStateResponse {
  google.protobuf.Struct state = 1;
}

// Set internal state for testing
message SetInternalStateRequest {
  google.protobuf.Struct state = 1;
}

message SetInternalStateResponse {
  bool success = 1;
  string message = 2;
}

// =====================
// Model and Server Info
// =====================

// Get model information
message GetModelInfoRequest {}

message GetModelInfoResponse {
  string model_path = 1;
  string tokenizer_path = 2;
  // NOTE(review): presumably distinguishes generation models from
  // embedding-only models (the service exposes both RPCs) — verify.
  bool is_generation = 3;
  string preferred_sampling_params = 4;  // JSON string or empty
  string weight_version = 5;
  string served_model_name = 6;
  int32 max_context_length = 7;
  int32 vocab_size = 8;
  bool supports_vision = 9;
  string model_type = 10;
  repeated int32 eos_token_ids = 11;
  int32 pad_token_id = 12;
  int32 bos_token_id = 13;
  int32 max_req_input_len = 14;
}

// Get server information
message GetServerInfoRequest {}

message GetServerInfoResponse {
  // Server configuration (as structured data)
  google.protobuf.Struct server_args = 1;

  // Scheduler metrics (from scheduler initialization)
  google.protobuf.Struct scheduler_info = 2;

  // Runtime state
  int32 active_requests = 3;
  bool is_paused = 4;
  // NOTE(review): units/epoch of this double timestamp are not shown here —
  // verify the producer before interpreting.
  double last_receive_timestamp = 5;
  double uptime_seconds = 6;

  // Version info
  string sglang_version = 7;

  // Server metadata
  string server_type = 8;  // "grpc"
  google.protobuf.Timestamp start_time = 9;

  // Note: internal_states not provided in gRPC mode
  // Scheduler-side metrics (memory usage, throughput) require
  // bidirectional communicator infrastructure not available in gRPC.
  // Use HTTP /get_server_info if scheduler internal state is needed.
}