generate.proto 5.88 KB
Newer Older
Olivier Dehaene's avatar
Init  
Olivier Dehaene committed
1
2
/// gRPC contract for the text generation service: declares the
/// TextGenerationService RPCs and their request/response messages.
syntax = "proto3";

package generate.v2;
Olivier Dehaene's avatar
Init  
Olivier Dehaene committed
4

Olivier Dehaene's avatar
Olivier Dehaene committed
5
/// RPC surface of the text generation service.
/// Every method is unary and has its own dedicated request/response message.
service TextGenerationService {
    /// Model Info
    rpc Info (InfoRequest) returns (InfoResponse);
    /// Service discovery
    rpc ServiceDiscovery (ServiceDiscoveryRequest) returns (ServiceDiscoveryResponse);
    /// Empties batch cache
    rpc ClearCache (ClearCacheRequest) returns (ClearCacheResponse);
    /// Remove requests from a cached batch
    rpc FilterBatch (FilterBatchRequest) returns (FilterBatchResponse);
    /// Warmup the model and compute max cache size
    rpc Warmup (WarmupRequest) returns (WarmupResponse);
    /// Prefill batch and decode first token
    rpc Prefill (PrefillRequest) returns (PrefillResponse);
    /// Decode token for a list of prefilled batches
    rpc Decode (DecodeRequest) returns (DecodeResponse);
    /// Health check
    rpc Health (HealthRequest) returns (HealthResponse);
}

24
25
26
/// Empty request of the Health RPC
message HealthRequest {}
/// Empty response of the Health RPC
message HealthResponse {}

27
28
29
30
31
32
33
/// Empty request of the Info RPC
message InfoRequest {}

/// Response of the Info RPC
message InfoResponse {
    /// NOTE(review): presumably whether the model needs padded inputs - confirm
    bool requires_padding = 1;
    /// Data type name (presumably the dtype the model runs with - confirm)
    string dtype = 2;
    /// Device type name (presumably the device the model runs on - confirm)
    string device_type = 3;
    /// Window size; declared optional, so "unset" is distinguishable from 0
    optional uint32 window_size = 4;
    /// NOTE(review): presumably the number of speculated tokens - confirm
    uint32 speculate = 5;
}

Olivier Dehaene's avatar
Olivier Dehaene committed
38
39
40
/// Empty request of the ServiceDiscovery RPC
message ServiceDiscoveryRequest {}

Olivier Dehaene's avatar
Init  
Olivier Dehaene committed
41
/// Response of the ServiceDiscovery RPC
message ServiceDiscoveryResponse {
    /// Other shards urls
    repeated string urls = 1;
}

46
47
48
49
/// Request of the ClearCache RPC
message ClearCacheRequest {
    /// Optional batch id
    /// NOTE(review): presumably the whole cache is cleared when unset - confirm
    optional uint64 id = 1;
}
Olivier Dehaene's avatar
Olivier Dehaene committed
50
51
52
53

/// Empty response of the ClearCache RPC
message ClearCacheResponse {}

drbh's avatar
drbh committed
54
55
56
57
58
59
/// Kind of grammar carried by NextTokenChooserParameters.grammar
enum GrammarType {
    /// Zero value, hence the proto3 default
    GRAMMAR_TYPE_NONE = 0;
    /// NOTE(review): presumably a JSON (schema) grammar - confirm
    GRAMMAR_TYPE_JSON = 1;
    /// NOTE(review): presumably a regular-expression grammar - confirm
    GRAMMAR_TYPE_REGEX = 2;
}

OlivierDehaene's avatar
OlivierDehaene committed
60
/// Sampling parameters used to choose the next token of a request
message NextTokenChooserParameters {
    /// exponential scaling output probability distribution
    float temperature = 1;
    /// restricting to the k highest probability elements
    uint32 top_k = 2;
    /// restricting to the top tokens whose cumulative probability is <= top_p
    float top_p = 3;
    /// typical-p threshold
    /// NOTE(review): presumably the probability mass kept by typical sampling - confirm
    float typical_p = 4;
    /// apply sampling on the logits
    bool do_sample = 5;
    /// random seed for sampling
    uint64 seed = 6;
    /// repetition penalty
    float repetition_penalty = 7;
    /// frequency penalty
    float frequency_penalty = 9;
    /// token watermarking using "A Watermark for Large Language Models"
    bool watermark = 8;
    /// grammar (applied if not empty)
    string grammar = 10;
    /// grammar type
    GrammarType grammar_type = 11;
}

85
86
87
88
89
/// Criteria deciding when generation stops for a request
message StoppingCriteriaParameters {
    /// Maximum number of generated tokens
    uint32 max_new_tokens = 1;
    /// Optional stopping sequences
    repeated string stop_sequences = 2;
    /// Ignore end of sequence token
    /// used for benchmarking
    bool ignore_eos_token = 3;
}

Olivier Dehaene's avatar
Init  
Olivier Dehaene committed
95
96
97
/// A single generation request, as carried inside a Batch
message Request {
    /// Request ID
    uint64 id = 1;
    /// The generation context
    string inputs = 2;
    /// Context truncation
    /// NOTE(review): presumably a token count - confirm
    uint32 truncate = 3;
    /// Next Token Chooser Parameters
    NextTokenChooserParameters parameters = 4;
    /// Stopping Criteria Parameters
    StoppingCriteriaParameters stopping_parameters = 5;
    /// Return prefill logprobs
    bool prefill_logprobs = 6;
    /// Return most likely n tokens
    uint32 top_n_tokens = 7;
}

/// A batch of full requests, as sent by the Prefill and Warmup RPCs
message Batch {
    /// Batch ID
    uint64 id = 1;
    /// Individual requests
    repeated Request requests = 2;
    /// Batch size (==len(requests))
    uint32 size = 3;
    /// Maximum number of tokens this batch will grow to
    uint32 max_tokens = 4;
}

123
124
125
126
127
128
129
130
131
132
133
/// Reference to a batch by id, carrying request ids instead of full requests
message CachedBatch {
    /// Batch ID
    uint64 id = 1;
    /// Individual requests ids
    repeated uint64 request_ids = 2;
    /// Batch size (==len(request_ids))
    uint32 size = 3;
    /// Maximum number of tokens this batch will grow to
    uint32 max_tokens = 4;
}

134
135
136
137
138
139
/// Reason a generation finished (see GeneratedText.finish_reason)
enum FinishReason {
    /// Zero value, hence the proto3 default
    /// NOTE(review): presumably the max_new_tokens limit was reached - confirm
    FINISH_REASON_LENGTH = 0;
    /// NOTE(review): presumably an end-of-sequence token was generated - confirm
    FINISH_REASON_EOS_TOKEN = 1;
    /// NOTE(review): presumably one of the stop_sequences was matched - confirm
    FINISH_REASON_STOP_SEQUENCE = 2;
}

Olivier Dehaene's avatar
Olivier Dehaene committed
140
/// Final text produced for a request, with generation metadata
message GeneratedText {
    /// Output
    string text = 1;
    /// Number of generated tokens
    uint32 generated_tokens = 2;
    /// Finish reason
    FinishReason finish_reason = 3;
    /// Seed; declared optional, so "unset" is distinguishable from 0
    optional uint64 seed = 4;
}

Nicolas Patry's avatar
Nicolas Patry committed
151
152
/// A list of tokens described by four repeated fields
/// NOTE(review): the fields are presumably index-aligned (entry i of each
/// list describes the same token) - confirm
message Tokens {
    /// Token IDs
    repeated uint32 ids = 1;
    /// Logprobs
    repeated float logprobs = 2;
    /// tokens
    repeated string texts = 3;
    /// special
    repeated bool is_special = 4;
}

162
163
164
165
/// Output for one request, as returned by the Prefill and Decode RPCs
message Generation {
    /// Request ID
    uint64 request_id = 1;
    /// Prefill tokens (optional)
    Tokens prefill_tokens = 2;
    /// Tokens for this request
    /// NOTE(review): presumably the newly generated tokens - confirm
    Tokens tokens = 3;
    /// Complete generated text
    optional GeneratedText generated_text = 4;
    /// Top tokens
    repeated Tokens top_tokens = 5;
}

174
175
176
177
/// Request of the FilterBatch RPC
message FilterBatchRequest {
    /// Batch ID
    uint64 batch_id = 1;
    /// Requests to keep
    repeated uint64 request_ids = 2;
}

/// Response of the FilterBatch RPC
message FilterBatchResponse {
    /// Filtered Batch (cached)
    CachedBatch batch = 1;
}


187
/// Request of the Prefill RPC
message PrefillRequest {
    /// Batch
    Batch batch = 1;
}

192
193
194
/// Response of the Prefill RPC
message PrefillResponse {
    /// Generation
    repeated Generation generations = 1;
    /// Next batch (cached)
    optional CachedBatch batch = 2;
    /// Forward elapsed time in nanoseconds
    uint64 forward_ns = 3;
    /// Decode elapsed time in nanoseconds
    uint64 decode_ns = 4;
    /// Total elapsed time in nanoseconds
    uint64 total_ns = 5;
}

205
/// Request of the Decode RPC
message DecodeRequest {
    /// Cached batches
    repeated CachedBatch batches = 1;
}
Olivier Dehaene's avatar
Init  
Olivier Dehaene committed
209

210
211
212
/// Response of the Decode RPC
message DecodeResponse {
    /// Decodes
    repeated Generation generations = 1;
    /// Next batch (cached)
    optional CachedBatch batch = 2;
    /// Forward elapsed time in nanoseconds
    uint64 forward_ns = 3;
    /// Decode elapsed time in nanoseconds
    uint64 decode_ns = 4;
    /// Total elapsed time in nanoseconds
    uint64 total_ns = 5;
    /// Concatenate elapsed time in nanoseconds
    optional uint64 concat_ns = 6;
}
224
225
226
227

/// Request of the Warmup RPC
message WarmupRequest {
    /// Batch to warmup on
    Batch batch = 1;
    /// NOTE(review): presumably the maximum input length in tokens - confirm
    uint32 max_input_length = 2;
    /// NOTE(review): presumably the maximum number of tokens per prefill - confirm
    uint32 max_prefill_tokens = 3;
    /// NOTE(review): presumably the maximum input + generated tokens per request - confirm
    uint32 max_total_tokens = 4;
}

233
234
235
236
/// Response of the Warmup RPC
message WarmupResponse {
    /// Maximum number of tokens supported by the model
    /// Declared optional, so "unset" is distinguishable from 0
    optional uint32 max_supported_total_tokens = 1;
}