/// Payload validation logic
use crate::validation::ValidationError::{BestOfSampling, BestOfSeed, EmptyInput};
use crate::{GenerateParameters, GenerateRequest};
use rand::{thread_rng, Rng};
use text_generation_client::{NextTokenChooserParameters, StoppingCriteriaParameters};
use thiserror::Error;
use tokenizers::tokenizer::Tokenizer;
use tokenizers::TruncationDirection;
use tokio::sync::oneshot;
use tracing::{instrument, Span};

/// Validation
#[derive(Debug, Clone)]
pub struct Validation {
    /// Validation parameters
    max_best_of: usize,
    max_stop_sequences: usize,
    max_input_length: usize,
    max_total_tokens: usize,
    /// Channel to communicate with the background tokenization task
    sender: Option<flume::Sender<TokenizerRequest>>,
}

impl Validation {
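    /// Create a new `Validation`. When a tokenizer is provided, spawns
    /// `workers` blocking tokenization tasks; panics if `max_input_length`
    /// is not strictly smaller than `max_total_tokens`.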
    pub(crate) fn new(
        workers: usize,
        tokenizer: Option<Tokenizer>,
        max_best_of: usize,
        max_stop_sequences: usize,
        max_input_length: usize,
        max_total_tokens: usize,
    ) -> Self {
        if max_input_length >= max_total_tokens {
            panic!("`max_input_length` must be < `max_total_tokens`");
        }

        // If we have a fast tokenizer
        let sender = if let Some(tokenizer) = tokenizer {
            // Create channel
            let (validation_sender, validation_receiver) = flume::unbounded();

            // Create workers
            for _ in 0..workers {
                let tokenizer_clone = tokenizer.clone();
                let receiver_clone = validation_receiver.clone();

                // Spawn worker
                tokio::task::spawn_blocking(move || {
                    tokenizer_worker(tokenizer_clone, receiver_clone)
                });
            }
            Some(validation_sender)
        } else {
            None
        };

        Self {
            max_best_of,
            sender,
            max_stop_sequences,
            max_input_length,
            max_total_tokens,
        }
    }

    #[instrument(skip_all)]
    async fn validate_input(
        &self,
        inputs: String,
        truncate: Option<usize>,
        max_new_tokens: u32,
    ) -> Result<(String, usize), ValidationError> {
        // If we have a fast tokenizer
        if let Some(sender) = &self.sender {
            // Create response channel
            let (response_sender, response_receiver) = oneshot::channel();
            // Send request to the background validation task
            // Unwrap is safe here: workers hold the receiver for the whole
            // process lifetime, so the send only fails if a worker panicked
            sender
                .send(((inputs, truncate), response_sender, Span::current()))
                .unwrap();

            // Await on response channel
            // Unwrap is safe here: the worker only drops the oneshot sender
            // after sending a response, unless it panicked
            let (inputs, input_length) = response_receiver.await.unwrap()?;

            // Get total tokens
            let total_tokens = input_length + max_new_tokens as usize;

            // Validate MaxTotalTokens
            if total_tokens > self.max_total_tokens {
                return Err(ValidationError::MaxTotalTokens(
                    self.max_total_tokens,
                    input_length,
                    max_new_tokens,
                ));
            }

            // Validate InputLength
            if input_length > self.max_input_length {
                return Err(ValidationError::InputLength(
                    self.max_input_length,
                    input_length,
                ));
            }

            metrics::histogram!("tgi_request_input_length", input_length as f64);
            Ok((inputs, input_length))
        }
        // Return inputs without validation
        else {
            // In this case, we don't know the real length in tokens of the inputs
            // However, the inputs will be truncated by the python servers
            // We make sure that truncate + max_new_tokens <= self.max_total_tokens
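            // For example, with max_total_tokens = 5 and truncate = Some(4),
            // at most 1 new token can be requested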
            let input_length = truncate.unwrap_or(self.max_input_length);

            // Validate MaxNewTokens
            if (input_length as u32 + max_new_tokens) > self.max_total_tokens as u32 {
                return Err(ValidationError::MaxNewTokens(
                    self.max_total_tokens - input_length,
                    max_new_tokens,
                ));
            }

            Ok((inputs, input_length))
        }
    }

    /// Validate a payload and get the number of tokens in the input
    #[instrument(skip_all)]
    pub(crate) async fn validate(
        &self,
        request: GenerateRequest,
    ) -> Result<ValidGenerateRequest, ValidationError> {
        let GenerateParameters {
            best_of,
            temperature,
            repetition_penalty,
            top_k,
            top_p,
            typical_p,
            do_sample,
            max_new_tokens,
            stop: stop_sequences,
            truncate,
            seed,
            watermark,
            ..
        } = request.parameters;

        // sampling must be true when best_of > 1
        let best_of = best_of.unwrap_or(1);
        let sampling = do_sample
            || temperature.is_some()
            || top_k.is_some()
            || top_p.is_some()
            || typical_p.is_some();

        if best_of > 1 && !sampling {
            return Err(BestOfSampling);
        }

        let temperature = temperature.unwrap_or(1.0);
        if temperature <= 0.0 {
            return Err(ValidationError::Temperature);
        }

        let repetition_penalty = repetition_penalty.unwrap_or(1.0);
        if repetition_penalty <= 0.0 {
            return Err(ValidationError::RepetitionPenalty);
        }

        // Different because the proto default value is not a valid value
        // for the user
        let top_p = top_p
            .map(|value| {
                if value <= 0.0 || value >= 1.0 {
                    return Err(ValidationError::TopP);
                }
                Ok(value)
            })
            .unwrap_or(Ok(1.0))?;

        let typical_p = typical_p
            .map(|value| {
                if value <= 0.0 || value >= 1.0 {
                    return Err(ValidationError::TypicalP);
                }
                Ok(value)
            })
            .unwrap_or(Ok(1.0))?;

        let top_k: u32 = top_k
            .map(|value| {
                if value <= 0 {
                    return Err(ValidationError::TopK);
                }
                Ok(value as u32)
            })
            .unwrap_or(Ok(0))?;

        if max_new_tokens == 0 {
            return Err(ValidationError::NegativeMaxNewTokens);
        }

        if stop_sequences.len() > self.max_stop_sequences {
            return Err(ValidationError::StopSequence(
                self.max_stop_sequences,
                stop_sequences.len(),
            ));
        }

        // If seed is None, assign a random one
        let seed = match seed {
            None => thread_rng().gen(),
            Some(seed) => {
                if best_of > 1 {
                    return Err(BestOfSeed);
                }
                seed
            }
        };

        // Check if inputs is empty
        if request.inputs.is_empty() {
            return Err(EmptyInput);
        }

        // Check if truncate is strictly positive and less than max_input_length
        let truncate = truncate
            .map(|value| {
                if value == 0 || value > self.max_input_length {
                    return Err(ValidationError::Truncate(self.max_input_length, value));
                }
                Ok(Some(value))
            })
            .unwrap_or(Ok(None))?;

        // Validate inputs
        let (inputs, input_length) = self
            .validate_input(request.inputs, truncate, max_new_tokens)
            .await?;

        let parameters = NextTokenChooserParameters {
            temperature,
            repetition_penalty,
            top_k,
            top_p,
            typical_p,
            do_sample,
            seed,
            watermark,
        };
        let stopping_parameters = StoppingCriteriaParameters {
            max_new_tokens,
            stop_sequences,
            ignore_eos_token: false,
        };

        metrics::histogram!("tgi_request_max_new_tokens", max_new_tokens as f64);

        Ok(ValidGenerateRequest {
            inputs,
            input_length: input_length as u32,
            truncate: truncate.unwrap_or(self.max_input_length) as u32,
            parameters,
            stopping_parameters,
        })
    }

    /// Validate the best_of parameter
    #[instrument(skip_all)]
    pub(crate) fn validate_best_of(&self, best_of: usize) -> Result<usize, ValidationError> {
        if self.max_best_of == 1 && best_of != 1 {
            return Err(ValidationError::BestOfDisabled);
        }

        if best_of > self.max_best_of {
            return Err(ValidationError::BestOf(self.max_best_of, best_of));
        }

        Ok(best_of)
    }
}

/// Tokenization worker loop: serves requests on a blocking thread
fn tokenizer_worker(tokenizer: Tokenizer, receiver: flume::Receiver<TokenizerRequest>) {
    // Loop over requests
    while let Ok(((inputs, truncate), response_tx, parent_span)) = receiver.recv() {
        parent_span.in_scope(|| {
            response_tx
                .send(prepare_input(inputs, truncate, &tokenizer))
                .unwrap_or(())
        })
    }
}

/// Get input length and optionally truncate it
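/// Truncation is taken from the left, so the most recent part of the prompt
/// is kept; the decoded (possibly shortened) inputs are returned together
/// with their token count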
fn prepare_input(
    inputs: String,
    truncate: Option<usize>,
    tokenizer: &Tokenizer,
) -> Result<(String, usize), ValidationError> {
    // Get the number of tokens in the input
    let mut encoding = tokenizer
        .encode(inputs.clone(), true)
        .map_err(|err| ValidationError::Tokenizer(err.to_string()))?;

    // Optionally truncate
    let (inputs, input_length) = match truncate {
        // Truncate is some and < encoding length
        Some(truncate) if truncate < encoding.len() => {
            // truncate encoding and decode new inputs
            encoding.truncate(truncate, 0, TruncationDirection::Left);
            let inputs = tokenizer
                .decode(Vec::from(encoding.get_ids()), false)
                .map_err(|err| ValidationError::Tokenizer(err.to_string()))?;
            (inputs, encoding.len())
        }
        // Nothing to do
        _ => (inputs, encoding.len()),
    };

    Ok((inputs, input_length))
}

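/// A tokenization job: the `(inputs, truncate)` payload, a oneshot channel
/// to send the result back, and the tracing span of the originating request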
type TokenizerRequest = (
    (String, Option<usize>),
    oneshot::Sender<Result<(String, usize), ValidationError>>,
    Span,
);

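/// A request that passed validation; its parameters are already converted
/// into the client protobuf types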
#[derive(Debug)]
pub(crate) struct ValidGenerateRequest {
    pub inputs: String,
    pub input_length: u32,
    pub truncate: u32,
    pub parameters: NextTokenChooserParameters,
    pub stopping_parameters: StoppingCriteriaParameters,
}

#[derive(Error, Debug)]
pub enum ValidationError {
    #[error("`best_of` must be > 0 and <= {0}. Given: {1}")]
    BestOf(usize, usize),
    #[error("`best_of` != 1 is not allowed for this endpoint")]
    BestOfDisabled,
    #[error("you must use sampling when `best_of` is > 1")]
    BestOfSampling,
    #[error("`seed` must not be set when `best_of` > 1")]
    BestOfSeed,
    #[error("`best_of` != 1 is not supported when streaming tokens")]
    BestOfStream,
    #[error("`temperature` must be strictly positive")]
    Temperature,
    #[error("`repetition_penalty` must be strictly positive")]
    RepetitionPenalty,
    #[error("`top_p` must be > 0.0 and < 1.0")]
    TopP,
    #[error("`top_k` must be strictly positive")]
    TopK,
    #[error("`truncate` must be strictly positive and less than {0}. Given: {1}")]
    Truncate(usize, usize),
    #[error("`typical_p` must be > 0.0 and < 1.0")]
    TypicalP,
    #[error("`max_new_tokens` must be strictly positive")]
    NegativeMaxNewTokens,
    #[error("`max_new_tokens` must be <= {0}. Given: {1}")]
    MaxNewTokens(usize, u32),
    #[error("`inputs` tokens + `max_new_tokens` must be <= {0}. Given: {1} `inputs` tokens and {2} `max_new_tokens`")]
    MaxTotalTokens(usize, usize, u32),
    #[error("`inputs` must have less than {0} tokens. Given: {1}")]
    InputLength(usize, usize),
    #[error("`inputs` cannot be empty")]
    EmptyInput,
    #[error("`stop` supports up to {0} stop sequences. Given: {1}")]
    StopSequence(usize, usize),
    #[error("tokenizer error {0}")]
    Tokenizer(String),
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    #[tokio::test]
    async fn test_validation_max_new_tokens() {
        let tokenizer = None;
        let max_best_of = 2;
        let max_stop_sequence = 3;
        let max_input_length = 4;
        let max_total_tokens = 5;
        let workers = 1;
        let validation = Validation::new(
            workers,
            tokenizer,
            max_best_of,
            max_stop_sequence,
            max_input_length,
            max_total_tokens,
        );

        let max_new_tokens = 10;
        match validation
            .validate_input("Hello".to_string(), None, max_new_tokens)
            .await
        {
            Err(ValidationError::MaxNewTokens(1, 10)) => (),
            _ => panic!("expected a MaxNewTokens error"),
        }
    }

    async fn get_tokenizer() -> Tokenizer {
        if !std::path::Path::new("tokenizer.json").exists() {
            let content = reqwest::get("https://huggingface.co/gpt2/raw/main/tokenizer.json")
                .await
                .unwrap()
                .bytes()
                .await
                .unwrap();
            let mut file = std::fs::File::create("tokenizer.json").unwrap();
            file.write_all(&content).unwrap();
        }
        Tokenizer::from_file("tokenizer.json").unwrap()
    }

    #[tokio::test]
    async fn test_validation_input_length() {
        let tokenizer = Some(get_tokenizer().await);
        let max_best_of = 2;
        let max_stop_sequence = 3;
        let max_input_length = 4;
        let max_total_tokens = 5;
        let workers = 1;
        let validation = Validation::new(
            workers,
            tokenizer,
            max_best_of,
            max_stop_sequence,
            max_input_length,
            max_total_tokens,
        );

        let max_new_tokens = 10;
        match validation
            .validate_input("Hello".to_string(), None, max_new_tokens)
            .await
        {
            Err(ValidationError::MaxTotalTokens(5, 1, 10)) => (),
            _ => panic!("expected a MaxTotalTokens error"),
        }
    }
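
    #[tokio::test]
    async fn test_validation_best_of() {
        // A minimal sketch exercising `validate_best_of` directly: values
        // within `max_best_of` pass through unchanged, values above it
        // surface a `BestOf` error.
        let validation = Validation::new(1, None, 2, 3, 4, 5);

        assert!(validation.validate_best_of(2).is_ok());
        match validation.validate_best_of(3) {
            Err(ValidationError::BestOf(2, 3)) => (),
            _ => panic!("expected a BestOf error"),
        }
    }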
}