"vscode:/vscode.git/clone" did not exist on "62f8eb48b1fa121d185ba3226f093d8f11cc9183"
generate.proto 5.4 KB
Newer Older
Olivier Dehaene's avatar
Init  
Olivier Dehaene committed
1
2
3
4
syntax = "proto3";

package generate.v1;

/// RPCs exposed by a model shard to the router.
service TextGenerationService {
    /// Model Info
    rpc Info (InfoRequest) returns (InfoResponse);
    /// Service discovery
    rpc ServiceDiscovery (ServiceDiscoveryRequest) returns (ServiceDiscoveryResponse);
    /// Empties batch cache
    rpc ClearCache (ClearCacheRequest) returns (ClearCacheResponse);
    /// Remove requests from a cached batch
    rpc FilterBatch (FilterBatchRequest) returns (FilterBatchResponse);
    /// Warmup the model and compute max cache size
    rpc Warmup (WarmupRequest) returns (WarmupResponse);
    /// Prefill batch and decode first token
    rpc Prefill (PrefillRequest) returns (PrefillResponse);
    /// Decode token for a list of prefilled batches
    rpc Decode (DecodeRequest) returns (DecodeResponse);
    /// Health check
    rpc Health (HealthRequest) returns (HealthResponse);
}

/// Empty request
message HealthRequest {}
/// Empty response
message HealthResponse {}

/// Empty request
message InfoRequest {}

/// Static information about the model loaded by the shard.
message InfoResponse {
    /// Whether the model requires input padding
    bool requires_padding = 1;
    /// Model dtype (e.g. as reported by the backend)
    string dtype = 2;
    /// Device the model runs on
    string device_type = 3;
    /// Sliding attention window size, if the model uses one
    optional uint32 window_size = 4;
}

/// Empty request
message ServiceDiscoveryRequest {}

message ServiceDiscoveryResponse {
    /// Other shards urls
    repeated string urls = 1;
}

message ClearCacheRequest {
    /// Optional batch id; when unset, the whole cache is cleared
    optional uint64 id = 1;
}

/// Empty response
message ClearCacheResponse {}

/// Sampling parameters used to pick the next token.
message NextTokenChooserParameters {
    /// exponential scaling output probability distribution
    float temperature = 1;
    /// restricting to the k highest probability elements
    uint32 top_k = 2;
    /// restricting to the smallest set of tokens whose cumulative probability is at most top_p (nucleus sampling)
    float top_p = 3;
    /// restricting to tokens closest in likelihood to the expected information content (typical decoding)
    float typical_p = 4;
    /// apply sampling on the logits
    bool do_sample = 5;
    /// random seed for sampling
    uint64 seed = 6;
    /// repetition penalty
    float repetition_penalty = 7;
    /// token watermarking using "A Watermark for Large Language Models"
    bool watermark = 8;
}

/// Criteria that end generation for a request.
message StoppingCriteriaParameters {
    /// Maximum number of generated tokens
    uint32 max_new_tokens = 1;
    /// Optional stopping sequences
    repeated string stop_sequences = 2;
    /// Ignore end of sequence token
    /// used for benchmarking
    bool ignore_eos_token = 3;
}

/// A single generation request inside a Batch.
message Request {
    /// Request ID
    uint64 id = 1;
    /// The generation context
    string inputs = 2;
    /// Context truncation
    uint32 truncate = 3;
    /// Next Token Chooser Parameters
    NextTokenChooserParameters parameters = 4;
    /// Stopping Criteria Parameters
    StoppingCriteriaParameters stopping_parameters = 5;
    /// Return prefill logprobs
    bool prefill_logprobs = 6;
    /// Return most likely n tokens
    uint32 top_n_tokens = 7;
}

/// A batch of requests sent to a shard for prefill.
message Batch {
    /// Batch ID
    uint64 id = 1;
    /// Individual requests
    repeated Request requests = 2;
    /// Batch size (==len(requests))
    uint32 size = 3;
    /// Maximum number of tokens this batch will grow to
    uint32 max_tokens = 4;
}

/// Lightweight handle to a batch already cached on the shard.
message CachedBatch {
    /// Batch ID
    uint64 id = 1;
    /// Individual requests ids
    repeated uint64 request_ids = 2;
    /// Batch size (==len(requests))
    uint32 size = 3;
    /// Maximum number of tokens this batch will grow to
    uint32 max_tokens = 4;
}

/// Why generation stopped for a request.
/// NOTE(review): value 0 carries business meaning (LENGTH) instead of an
/// UNSPECIFIED sentinel; this cannot be changed without breaking the
/// published wire contract, so it is kept as-is.
enum FinishReason {
    /// max_new_tokens was reached
    FINISH_REASON_LENGTH = 0;
    /// The model emitted its end-of-sequence token
    FINISH_REASON_EOS_TOKEN = 1;
    /// A configured stop sequence was matched
    FINISH_REASON_STOP_SEQUENCE = 2;
}

/// Final generated text for a finished request.
message GeneratedText {
    /// Output
    string text = 1;
    /// Number of generated tokens
    uint32 generated_tokens = 2;
    /// Finish reason
    FinishReason finish_reason = 3;
    /// Seed
    optional uint64 seed = 4;
}

/// Tokens of the prompt, zipped by index (ids[i] / logprobs[i] / texts[i]).
message PrefillTokens {
    /// Prefill Token IDs
    repeated uint32 ids = 1;
    /// Prefill Logprobs
    repeated float logprobs = 2;
    /// Prefill tokens
    repeated string texts = 3;
}

/// Most likely tokens at a step, zipped by index.
/// NOTE(review): field numbers 4 and 5 are unused — presumably removed
/// fields; consider adding `reserved 4, 5;` after confirming history.
message TopTokens {
    /// Top Token IDs
    repeated uint32 ids = 1;
    /// Top Logprobs
    repeated float logprobs = 2;
    /// Top Token Texts
    repeated string texts = 3;
    /// If the tokens are special
    repeated bool is_special = 6;
}

/// One generation step for one request.
message Generation {
    /// Request ID
    uint64 request_id = 1;
    /// Prefill tokens (optional)
    PrefillTokens prefill_tokens = 2;
    /// Token ID
    uint32 token_id = 3;
    /// Logprob
    float token_logprob = 4;
    /// Text
    string token_text = 5;
    /// Is it a special token
    bool token_is_special = 6;
    /// Complete generated text, set only when the request is finished
    optional GeneratedText generated_text = 7;
    /// Top tokens
    TopTokens top_tokens = 8;
}

message FilterBatchRequest {
    /// Batch ID
    uint64 batch_id = 1;
    /// Requests to keep
    repeated uint64 request_ids = 2;
}

message FilterBatchResponse {
    /// Filtered Batch (cached)
    CachedBatch batch = 1;
}


message PrefillRequest {
    /// Batch
    Batch batch = 1;
}

message PrefillResponse {
    /// Generation
    repeated Generation generations = 1;
    /// Next batch (cached)
    optional CachedBatch batch = 2;
}

message DecodeRequest {
    /// Cached batches
    repeated CachedBatch batches = 1;
}
message DecodeResponse {
    /// Decodes
    repeated Generation generations = 1;
    /// Next batch (cached)
    optional CachedBatch batch = 2;
}
message WarmupRequest {
    /// Batch to warmup on
    Batch batch = 1;
}

message WarmupResponse {
    /// Maximum number of tokens supported by the model
    optional uint32 max_supported_total_tokens = 1;
}