// generate.proto
syntax = "proto3";

package generate.v1;

/// Text generation service: four unary RPCs covering shard discovery,
/// batch cache clearing, prefill and decode
service TextGenerationService {
    /// Service discovery
    rpc ServiceDiscovery (ServiceDiscoveryRequest) returns (ServiceDiscoveryResponse);
    /// Empties batch cache
    rpc ClearCache (ClearCacheRequest) returns (ClearCacheResponse);
    /// Prefill batch and decode first token
    rpc Prefill (PrefillRequest) returns (PrefillResponse);
    /// Decode token for a list of prefilled batches
    rpc Decode (DecodeRequest) returns (DecodeResponse);
}

/// Empty request of the ServiceDiscovery RPC
message ServiceDiscoveryRequest {}

/// Response of the ServiceDiscovery RPC
message ServiceDiscoveryResponse {
    /// Other shards urls
    repeated string urls = 1;
}

/// Request of the ClearCache RPC
message ClearCacheRequest {
    /// Optional batch id (explicit presence: unset is distinguishable from 0)
    /// NOTE(review): presumably restricts the clear to that batch - confirm
    optional uint64 id = 1;
}

/// Empty response of the ClearCache RPC
message ClearCacheResponse {}

/// Parameters used to choose the next token of a request
message NextTokenChooserParameters {
    /// exponential scaling output probability distribution
    float temperature = 1;
    /// restricting to the k highest probability elements
    uint32 top_k = 2;
    /// restricting to top tokens by cumulative probability, with top_p as the cut-off
    float top_p = 3;
    /// NOTE(review): the previous comment here was a copy of the top_p one;
    /// presumably the probability-mass cut-off for typical decoding - confirm
    float typical_p = 4;
    /// apply sampling on the logits
    bool do_sample = 5;
    /// random seed for sampling
    uint64 seed = 6;
    /// repetition penalty
    float repetition_penalty = 7;
    /// token watermarking using "A Watermark for Large Language Models"
    bool watermark = 8;
}

/// Criteria that stop the generation of a request
message StoppingCriteriaParameters {
    /// Maximum number of generated tokens
    uint32 max_new_tokens = 1;
    /// Optional stopping sequences
    repeated string stop_sequences = 2;
    /// Ignore end of sequence token
    /// used for benchmarking
    bool ignore_eos_token = 3;
}

/// A single generation request
message Request {
    /// Request ID
    uint64 id = 1;
    /// The generation context
    string inputs = 2;
    /// Next Token Chooser Parameters
    NextTokenChooserParameters parameters = 3;
    /// Stopping Criteria Parameters
    StoppingCriteriaParameters stopping_parameters = 4;
}

/// A batch of requests
message Batch {
    /// Batch ID
    uint64 id = 1;
    /// Individual requests
    repeated Request requests = 2;
    /// Batch size (==len(requests))
    uint32 size = 3;
}

/// Reason attached to a GeneratedText
/// NOTE(review): the zero value carries a real meaning, so a finish_reason
/// left unset decodes as FINISH_REASON_LENGTH under proto3 defaults;
/// renumbering would break the wire format, so the values are kept as-is
enum FinishReason {
    /// presumably max_new_tokens was reached - confirm
    FINISH_REASON_LENGTH = 0;
    /// presumably the end of sequence token was generated - confirm
    FINISH_REASON_EOS_TOKEN = 1;
    /// presumably one of stop_sequences was matched - confirm
    FINISH_REASON_STOP_SEQUENCE = 2;
}

/// Generated text of a request
message GeneratedText {
    /// Output
    string text = 1;
    /// Number of generated tokens
    uint32 generated_tokens = 2;
    /// Finish reason
    FinishReason finish_reason = 3;
    /// Seed (explicit presence: unset is distinguishable from 0)
    optional uint64 seed = 4;
}

/// Prefill tokens, described by three repeated fields
/// NOTE(review): presumably index-aligned (ids[i], logprobs[i], texts[i]) - confirm
message PrefillTokens {
    /// Token IDs of the prefill
    repeated uint32 ids = 1;
    /// Logprobs of the prefill
    repeated float logprobs = 2;
    /// Token texts of the prefill
    repeated string texts = 3;
}

/// Generation output for one request
message Generation {
    /// Request ID
    uint64 request_id = 1;
    /// Prefill tokens (optional)
    PrefillTokens prefill_tokens = 2;
    /// Token ID
    uint32 token_id = 3;
    /// Logprob
    float token_logprob = 4;
    /// Text
    string token_text = 5;
    /// Is it a special token
    bool token_is_special = 6;
    /// Complete generated text
    GeneratedText generated_text = 7;
}

/// Request of the Prefill RPC
message PrefillRequest {
    /// Batch
    Batch batch = 1;
}

/// Response of the Prefill RPC
message PrefillResponse {
    /// Generation
    repeated Generation generations = 1;
    /// Next batch (cached)
    optional Batch batch = 2;
}

/// Request of the Decode RPC
message DecodeRequest {
    /// Cached batches
    repeated Batch batches = 1;
}

/// Response of the Decode RPC
message DecodeResponse {
    /// Decodes
    repeated Generation generations = 1;
    /// Next batch (cached)
    optional Batch batch = 2;
}