syntax = "proto3";

package generate.v3;

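/// Text generation service.
/// Typical flow: Warmup once, then Prefill new batches and Decode cached
/// batches; FilterBatch removes finished requests from a cached batch and
/// ClearCache drops it entirely.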
service TextGenerationService {
  /// Model Info
  rpc Info(InfoRequest) returns (InfoResponse) {}
  /// Service discovery
  rpc ServiceDiscovery(ServiceDiscoveryRequest)
      returns (ServiceDiscoveryResponse) {}
  /// Empties batch cache
  rpc ClearCache(ClearCacheRequest) returns (ClearCacheResponse);
  /// Remove requests from a cached batch
  rpc FilterBatch(FilterBatchRequest) returns (FilterBatchResponse);
  /// Warmup the model and compute max cache size
  rpc Warmup(WarmupRequest) returns (WarmupResponse);
  /// Prefill batch and decode first token
  rpc Prefill(PrefillRequest) returns (PrefillResponse);
  /// Decode token for a list of prefilled batches
  rpc Decode(DecodeRequest) returns (DecodeResponse);
  /// Health check
  rpc Health(HealthRequest) returns (HealthResponse);
}

message HealthRequest {}
message HealthResponse {}

/// Empty request
message InfoRequest {}

message InfoResponse {
  bool requires_padding = 1;
  string dtype = 2;
  string device_type = 3;
  optional uint32 window_size = 4;
  uint32 speculate = 5;
}

/// Empty request
message ServiceDiscoveryRequest {}

message ServiceDiscoveryResponse {
  /// Other shards urls
  repeated string urls = 1;
}

message ClearCacheRequest {
  /// Optional batch id
  optional uint64 id = 1;
}

/// Empty response
message ClearCacheResponse {}

message Image {
  /// Binary image data.
  bytes data = 1;

  /// Image MIME type.
  string mimetype = 2;
OlivierDehaene's avatar
OlivierDehaene committed
61
62
63
}

message InputChunk {
  oneof chunk {
    /// Plain text data
    string text = 1;
    /// Image data
    Image image = 2;
  }
}

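/// Generation input, given as an ordered sequence of chunks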
message Input { repeated InputChunk chunks = 1; }

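/// Type of grammar constraint applied during generation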
enum GrammarType {
  GRAMMAR_TYPE_NONE = 0;
  GRAMMAR_TYPE_JSON = 1;
  GRAMMAR_TYPE_REGEX = 2;
}

message NextTokenChooserParameters {
  /// exponential scaling of the output probability distribution
  float temperature = 1;
  /// restricting to the k highest probability elements
  uint32 top_k = 2;
  /// restricting to top tokens summing to prob_cut_off <= top_p
  float top_p = 3;
  /// restricting to top tokens summing to prob_cut_off <= typical_p
  float typical_p = 4;
  /// apply sampling on the logits
  bool do_sample = 5;
  /// random seed for sampling
  uint64 seed = 6;
  /// repetition penalty
  float repetition_penalty = 7;
  /// frequency penalty
  float frequency_penalty = 9;
  /// token watermarking using "A Watermark for Large Language Models"
  bool watermark = 8;
  /// grammar (applied if not empty)
  string grammar = 10;
  /// grammar type
  GrammarType grammar_type = 11;
}

message StoppingCriteriaParameters {
  /// Maximum number of generated tokens
  uint32 max_new_tokens = 1;
  /// Optional stopping sequences
  repeated string stop_sequences = 2;
  /// Ignore end of sequence token
  /// used for benchmarking
  bool ignore_eos_token = 3;
}

message Request {
  /// Request ID
  uint64 id = 1;
  /// The generation context as chunks
  Input input_chunks = 8;
  /// The generation context, stringified input_chunks
  string inputs = 2;
  /// Context truncation
  uint32 truncate = 3;
  /// Next Token Chooser Parameters
  NextTokenChooserParameters parameters = 4;
  /// Stopping Criteria Parameters
  StoppingCriteriaParameters stopping_parameters = 5;
  /// Return prefill logprobs
  bool prefill_logprobs = 6;
  /// Return the n most likely tokens
  uint32 top_n_tokens = 7;
  /// Paged attention blocks
  repeated uint32 blocks = 9;
  /// Paged attention slots
  repeated uint32 slots = 10;
  /// LoRA adapter ID
  optional string adapter_id = 11;
  /// Prefix length that can be retrieved from the KV cache.
  uint32 prefix_len = 12;
  /// Whether to add special tokens to the input
  bool add_special_tokens = 13;
}

message Batch {
  /// Batch ID
  uint64 id = 1;
  /// Individual requests
  repeated Request requests = 2;
  /// Batch size (==len(requests))
  uint32 size = 3;
  /// Maximum number of tokens this batch will grow to
  uint32 max_tokens = 4;
  /// Maximum number of Paged Attention blocks
  uint32 max_blocks = 5;
}

message CachedBatch {
  /// Batch ID
  uint64 id = 1;
  /// Individual requests ids
  repeated uint64 request_ids = 2;
  /// Batch size (==len(requests))
  uint32 size = 3;
  /// Maximum number of tokens this batch will grow to
  uint32 max_tokens = 4;
}

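/// Reason why generation stopped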
enum FinishReason {
  FINISH_REASON_LENGTH = 0;
  FINISH_REASON_EOS_TOKEN = 1;
  FINISH_REASON_STOP_SEQUENCE = 2;
}

message GeneratedText {
  /// Output
  string text = 1;
  /// Number of generated tokens
  uint32 generated_tokens = 2;
  /// Finish reason
  FinishReason finish_reason = 3;
  /// Seed
  optional uint64 seed = 4;
}

message Tokens {
  /// Token IDs
  repeated uint32 ids = 1;
  /// Logprobs
  repeated float logprobs = 2;
  /// Token texts
  repeated string texts = 3;
  /// Whether each token is a special token
  repeated bool is_special = 4;
}

message Generation {
  /// Request ID
  uint64 request_id = 1;
  /// Prefill tokens (optional)
  Tokens prefill_tokens = 2;
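  /// Generated tokens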
  Tokens tokens = 3;
  /// Complete generated text
  optional GeneratedText generated_text = 4;
  /// Top tokens
  repeated Tokens top_tokens = 5;
}

message FilterBatchRequest {
  /// Batch ID
  uint64 batch_id = 1;
  /// Requests to keep
  repeated uint64 request_ids = 2;
}

message FilterBatchResponse {
  /// Filtered Batch (cached)
  CachedBatch batch = 1;
}

message PrefillRequest {
  /// Batch
  Batch batch = 1;
}

message PrefillResponse {
  /// Generation
  repeated Generation generations = 1;
  /// Next batch (cached)
  optional CachedBatch batch = 2;
  /// Forward elapsed time in nanoseconds
  uint64 forward_ns = 3;
  /// Decode elapsed time in nanoseconds
  uint64 decode_ns = 4;
  /// Total elapsed time in nanoseconds
  uint64 total_ns = 5;
}

message DecodeRequest {
  /// Cached batches
  repeated CachedBatch batches = 1;
}

message DecodeResponse {
  /// Decodes
  repeated Generation generations = 1;
  /// Next batch (cached)
  optional CachedBatch batch = 2;
  /// Forward elapsed time in nanoseconds
  uint64 forward_ns = 3;
  /// Decode elapsed time in nanoseconds
  uint64 decode_ns = 4;
  /// Total elapsed time in nanoseconds
  uint64 total_ns = 5;
  /// Concatenate elapsed time in nanoseconds
  optional uint64 concat_ns = 6;
}

message WarmupRequest {
  /// Batch to warmup on
  Batch batch = 1;
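  /// Maximum input length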
  uint32 max_input_length = 2;
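  /// Maximum number of prefill tokens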
  uint32 max_prefill_tokens = 3;
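  /// Maximum total number of tokens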
  uint32 max_total_tokens = 4;
}

message WarmupResponse {
  /// Maximum number of tokens supported by the model
  optional uint32 max_supported_total_tokens = 1;
}