syntax = "proto3";

package generate.v3;

/// gRPC contract exposed by a text-generation shard.
service TextGenerationService {
  /// Model Info
  rpc Info(InfoRequest) returns (InfoResponse) {}
  /// Service discovery
  rpc ServiceDiscovery(ServiceDiscoveryRequest)
      returns (ServiceDiscoveryResponse) {}
  /// Empties batch cache
  rpc ClearCache(ClearCacheRequest) returns (ClearCacheResponse) {}
  /// Remove requests from a cached batch
  rpc FilterBatch(FilterBatchRequest) returns (FilterBatchResponse) {}
  /// Warmup the model and compute max cache size
  rpc Warmup(WarmupRequest) returns (WarmupResponse) {}
  /// Prefill batch and decode first token
  rpc Prefill(PrefillRequest) returns (PrefillResponse) {}
  /// Decode token for a list of prefilled batches
  rpc Decode(DecodeRequest) returns (DecodeResponse) {}
  /// Health check
  rpc Health(HealthRequest) returns (HealthResponse) {}
}

/// Empty request for the Health RPC.
message HealthRequest {}
/// Empty response for the Health RPC.
message HealthResponse {}

/// Empty request for the Info RPC.
message InfoRequest {}

/// Static model / engine properties reported by the Info RPC.
message InfoResponse {
  /// Whether input batches require padding
  bool requires_padding = 1;
  /// Model data type identifier
  string dtype = 2;
  /// Device the model runs on
  string device_type = 3;
  /// Attention window size, if the model uses one
  optional uint32 window_size = 4;
  /// Number of speculative tokens
  uint32 speculate = 5;
  /// Whether prefill chunking is supported
  bool support_chunking = 6;
  /// Whether prefix caching is enabled
  bool use_prefix_caching = 7;
  /// Attention implementation in use
  string attention_impl = 8;
  /// KV-cache block size
  uint32 block_size = 9;
}

/// Empty request for the ServiceDiscovery RPC.
message ServiceDiscoveryRequest {}

/// Lists the sibling shards participating in this deployment.
message ServiceDiscoveryResponse {
  /// Other shards urls
  repeated string urls = 1;
}

/// Asks the shard to drop a cached batch (or every batch when no id is set).
message ClearCacheRequest {
  /// Optional batch id
  optional uint64 id = 1;
}

/// Empty response for the ClearCache RPC.
message ClearCacheResponse {}

/// An image supplied as part of a multimodal input.
message Image {
  /// Binary image data.
  bytes data = 1;

  /// Image MIME type.
  string mimetype = 2;
}

/// One piece of an input sequence: either plain text or an image.
message InputChunk {
  oneof chunk {
    /// Plain text data
    string text = 1;
    /// Image data
    Image image = 2;
  }
}

/// A full input sequence as an ordered list of chunks.
message Input { repeated InputChunk chunks = 1; }

/// Kind of grammar constraint applied during generation.
enum GrammarType {
  /// No grammar constraint (zero/default value)
  GRAMMAR_TYPE_NONE = 0;
  /// JSON-schema-driven grammar
  GRAMMAR_TYPE_JSON = 1;
  /// Regular-expression grammar
  GRAMMAR_TYPE_REGEX = 2;
}

/// Logits-processing and sampling parameters for choosing the next token.
message NextTokenChooserParameters {
  /// exponential scaling output probability distribution
  float temperature = 1;
  /// restricting to the k highest probability elements
  uint32 top_k = 2;
  /// restricting to top tokens summing to prob_cut_off <= top_p
  float top_p = 3;
  /// restricting to tokens within typical decoding mass <= typical_p
  float typical_p = 4;
  /// apply sampling on the logits
  bool do_sample = 5;
  /// random seed for sampling
  uint64 seed = 6;
  /// repetition penalty
  float repetition_penalty = 7;
  /// frequency penalty
  /// (field number 9: added after watermark; numbers are wire-frozen)
  float frequency_penalty = 9;
  /// token watermarking using "A Watermark for Large Language Models"
  bool watermark = 8;
  /// grammar (applied if not empty)
  string grammar = 10;
  /// grammar type
  GrammarType grammar_type = 11;
}

/// Criteria that terminate generation for a request.
message StoppingCriteriaParameters {
  /// Maximum number of generated tokens
  uint32 max_new_tokens = 1;
  /// Optional stopping sequences
  repeated string stop_sequences = 2;
  /// Ignore end of sequence token
  /// used for benchmarking
  bool ignore_eos_token = 3;
}

/// A single generation request inside a Batch.
message Request {
  /// Request ID
  uint64 id = 1;
  /// The generation context as chunks
  Input input_chunks = 8;
  /// The generation context, stringified input_chunks
  string inputs = 2;
  /// Context truncation
  uint32 truncate = 3;
  /// Next Token Chooser Parameters
  NextTokenChooserParameters parameters = 4;
  /// Stopping Criteria Parameters
  StoppingCriteriaParameters stopping_parameters = 5;
  /// Return prefill logprobs
  bool prefill_logprobs = 6;
  /// Return most likely n tokens
  uint32 top_n_tokens = 7;
  /// Paged attention blocks
  repeated uint32 blocks = 9;
  /// Paged attention slots
  repeated uint32 slots = 10;
  /// LORA adapter index
  optional string adapter_id = 11;
  /// Tokens that can be retrieved from the KV cache.
  /// This value is set for the first prefill and never reset
  uint32 cache_len = 12;
  /// Whether special tokens should be added to the input
  bool add_special_tokens = 13;
  /// Chunk of tokens that must be computed for the first prefill
  /// This value is set for the first prefill and never reset
  optional uint32 chunk_len = 14;
}

/// A batch of requests sent to Prefill (full request payloads included).
message Batch {
  /// Batch ID
  uint64 id = 1;
  /// Individual requests
  repeated Request requests = 2;
  /// Batch size (==len(requests))
  uint32 size = 3;
  /// Maximum number of tokens this batch will grow to
  uint32 max_tokens = 4;
  /// Maximum number of Paged Attention blocks
  uint32 max_blocks = 5;
}

/// A lightweight handle to a batch the shard already holds in its cache;
/// identifies requests by id only instead of repeating full payloads.
message CachedBatch {
  /// Batch ID
  uint64 id = 1;
  /// Individual requests ids
  repeated uint64 request_ids = 2;
  /// Batch size (==len(requests))
  uint32 size = 3;
  /// Maximum number of tokens this batch will grow to
  uint32 max_tokens = 4;
  /// Number of tokens in the next forward
  uint32 current_tokens = 5;
}

/// Why a request stopped generating.
/// NOTE(review): the zero value carries business meaning (LENGTH) instead of
/// an UNSPECIFIED sentinel; values are wire-frozen and cannot be renumbered.
enum FinishReason {
  /// Hit the max_new_tokens limit
  FINISH_REASON_LENGTH = 0;
  /// Generated the end-of-sequence token
  FINISH_REASON_EOS_TOKEN = 1;
  /// Matched one of the stop sequences
  FINISH_REASON_STOP_SEQUENCE = 2;
}

/// Final decoded output for a finished request.
message GeneratedText {
  /// Output
  string text = 1;
  /// Number of generated tokens
  uint32 generated_tokens = 2;
  /// Finish reason
  FinishReason finish_reason = 3;
  /// Seed
  optional uint64 seed = 4;
}

/// Parallel arrays describing a run of tokens (index i of each field
/// refers to the same token).
message Tokens {
  /// Token IDs
  repeated uint32 ids = 1;
  /// Logprobs
  repeated float logprobs = 2;
  /// tokens
  repeated string texts = 3;
  /// special
  repeated bool is_special = 4;
}

/// Per-request output of one Prefill/Decode step.
message Generation {
  /// Request ID
  uint64 request_id = 1;
  /// Prefill tokens (optional)
  Tokens prefill_tokens = 2;
  /// Tokens produced by this step
  Tokens tokens = 3;
  /// Complete generated text
  optional GeneratedText generated_text = 4;
  /// Top tokens
  repeated Tokens top_tokens = 5;
}

/// Keeps only the listed requests in a cached batch; the rest are dropped.
message FilterBatchRequest {
  /// Batch ID
  uint64 batch_id = 1;
  /// Requests to keep
  repeated uint64 request_ids = 2;
}

/// Result of filtering a cached batch.
message FilterBatchResponse {
  /// Filtered Batch (cached)
  CachedBatch batch = 1;
}

/// Input to the Prefill RPC.
message PrefillRequest {
  /// Batch
  Batch batch = 1;
  /// Optional cached batch
  CachedBatch cached_batch = 2;
}

/// Output of the Prefill RPC: first-step generations plus timing data.
message PrefillResponse {
  /// Generation
  repeated Generation generations = 1;
  /// Next batch (cached)
  optional CachedBatch batch = 2;
  /// Forward elapsed time in nanoseconds
  uint64 forward_ns = 3;
  /// Decode elapsed time in nanoseconds
  uint64 decode_ns = 4;
  /// Total elapsed time in nanoseconds
  uint64 total_ns = 5;
  /// Concatenate elapsed time in nanoseconds
  optional uint64 concat_ns = 6;
}

/// Input to the Decode RPC.
message DecodeRequest {
  /// Cached batches
  repeated CachedBatch batches = 1;
}

/// Output of the Decode RPC: one-step generations plus timing data.
message DecodeResponse {
  /// Decodes
  repeated Generation generations = 1;
  /// Next batch (cached)
  optional CachedBatch batch = 2;
  /// Forward elapsed time in nanoseconds
  uint64 forward_ns = 3;
  /// Decode elapsed time in nanoseconds
  uint64 decode_ns = 4;
  /// Total elapsed time in nanoseconds
  uint64 total_ns = 5;
  /// Concatenate elapsed time in nanoseconds
  optional uint64 concat_ns = 6;
}

/// Input to the Warmup RPC.
message WarmupRequest {
  /// Batch to warmup on
  Batch batch = 1;
  /// Maximum input length, in tokens
  uint32 max_input_length = 2;
  /// Maximum number of prefill tokens
  uint32 max_prefill_tokens = 3;
  /// Maximum total (input + generated) tokens
  uint32 max_total_tokens = 4;
}

/// Output of the Warmup RPC.
message WarmupResponse {
  /// Maximum number of tokens supported by the model
  optional uint32 max_supported_total_tokens = 1;
}