/// Payload validation logic
use crate::validation::ValidationError::{BestOfSampling, BestOfSeed, EmptyInput};
use crate::{GenerateParameters, GenerateRequest};
use rand::{thread_rng, Rng};
use text_generation_client::{NextTokenChooserParameters, StoppingCriteriaParameters};
use thiserror::Error;
use tokenizers::tokenizer::Tokenizer;
use tokenizers::TruncationDirection;
use tokio::sync::oneshot;
use tracing::{instrument, Span};

/// Validation
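///
/// Checks incoming payloads against the configured limits before they are
/// processed, delegating tokenization to background workers when a fast
/// tokenizer is available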
#[derive(Debug, Clone)]
pub struct Validation {
    /// Validation parameters
    max_best_of: usize,
    max_stop_sequences: usize,
    max_input_length: usize,
    max_total_tokens: usize,
    /// Channel to communicate with the background tokenization task
    sender: Option<flume::Sender<TokenizerRequest>>,
}

impl Validation {
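    /// Create a new `Validation`. When a tokenizer is given, spawn `workers`
    /// blocking tasks that answer tokenization requests in the background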
    pub(crate) fn new(
        workers: usize,
        tokenizer: Option<Tokenizer>,
        max_best_of: usize,
        max_stop_sequences: usize,
        max_input_length: usize,
        max_total_tokens: usize,
    ) -> Self {
        // If we have a fast tokenizer
        let sender = if let Some(tokenizer) = tokenizer {
            // Create channel
            let (validation_sender, validation_receiver) = flume::unbounded();

            // Create workers
            for _ in 0..workers {
                let tokenizer_clone = tokenizer.clone();
                let receiver_clone = validation_receiver.clone();

                // Spawn worker
                tokio::task::spawn_blocking(move || {
                    tokenizer_worker(tokenizer_clone, receiver_clone)
                });
            }
            Some(validation_sender)
        } else {
            None
        };

        Self {
            max_best_of,
            sender,
            max_stop_sequences,
            max_input_length,
            max_total_tokens,
        }
    }

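    /// Tokenize and validate a single input, enforcing `max_input_length` and
    /// `max_total_tokens`, and return the (possibly truncated) inputs together
    /// with their length in tokens (estimated when no tokenizer is available)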
    #[instrument(skip_all)]
    async fn validate_input(
        &self,
        inputs: String,
        truncate: Option<usize>,
        max_new_tokens: u32,
    ) -> Result<(String, usize), ValidationError> {
        // If we have a fast tokenizer
        if let Some(sender) = &self.sender {
            // Create response channel
            let (response_sender, response_receiver) = oneshot::channel();
            // Send request to the background validation task
            // Unwrap is safe here
            sender
                .send(((inputs, truncate), response_sender, Span::current()))
                .unwrap();

            // Await on response channel
            // Unwrap is safe here
            let (inputs, input_length) = response_receiver.await.unwrap()?;

            // Get total tokens
            let total_tokens = input_length + max_new_tokens as usize;

            // Validate MaxTotalTokens
            if total_tokens > self.max_total_tokens {
                return Err(ValidationError::MaxTotalTokens(
                    self.max_total_tokens,
                    input_length,
                    max_new_tokens,
                ));
            }

            // Validate InputLength
            if input_length > self.max_input_length {
                return Err(ValidationError::InputLength(
                    self.max_input_length,
                    input_length,
                ));
            }

            metrics::histogram!("tgi_request_input_length", input_length as f64);
            Ok((inputs, input_length))
        }
        // Return inputs without validation
        else {
            // In this case, we don't know the real length in tokens of the inputs
            // However, the inputs will be truncated by the Python servers
            // We make sure that truncate + max_new_tokens <= self.max_total_tokens
            let input_length = truncate.unwrap_or(self.max_input_length);

            // Validate MaxNewTokens
            if (input_length as u32 + max_new_tokens) > self.max_total_tokens as u32 {
                return Err(ValidationError::MaxNewTokens(
                    self.max_total_tokens - self.max_input_length,
                    max_new_tokens,
                ));
            }

            Ok((inputs, input_length))
        }
    }

    /// Validate a payload and get the number of tokens in the input
    #[instrument(skip_all)]
    pub(crate) async fn validate(
        &self,
        request: GenerateRequest,
    ) -> Result<ValidGenerateRequest, ValidationError> {
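        // Destructure the user-supplied parameters so each one can be
        // validated and resolved to a concrete value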
        let GenerateParameters {
            best_of,
            temperature,
            repetition_penalty,
            top_k,
            top_p,
            typical_p,
            do_sample,
            max_new_tokens,
            stop: stop_sequences,
            truncate,
            seed,
            watermark,
            decoder_input_details,
            ..
        } = request.parameters;

        // sampling must be true when best_of > 1
        let best_of = best_of.unwrap_or(1);
        let sampling = do_sample
            || temperature.is_some()
            || top_k.is_some()
            || top_p.is_some()
            || typical_p.is_some();

        if best_of > 1 && !sampling {
            return Err(BestOfSampling);
        }

        let temperature = temperature.unwrap_or(1.0);
        if temperature <= 0.0 {
            return Err(ValidationError::Temperature);
        }

        let repetition_penalty = repetition_penalty.unwrap_or(1.0);
        if repetition_penalty <= 0.0 {
            return Err(ValidationError::RepetitionPenalty);
        }

        // Different because the proto default value is not a valid value
        // for the user
        let top_p = top_p
            .map(|value| {
                if value <= 0.0 || value >= 1.0 {
                    return Err(ValidationError::TopP);
                }
                Ok(value)
            })
            .unwrap_or(Ok(1.0))?;

        let typical_p = typical_p
            .map(|value| {
                if value <= 0.0 || value >= 1.0 {
                    return Err(ValidationError::TypicalP);
                }
                Ok(value)
            })
            .unwrap_or(Ok(1.0))?;

        let top_k: u32 = top_k
            .map(|value| {
                if value <= 0 {
                    return Err(ValidationError::TopK);
                }
                Ok(value as u32)
            })
            .unwrap_or(Ok(0))?;

        if max_new_tokens == 0 {
            return Err(ValidationError::NegativeMaxNewTokens);
        }

        if stop_sequences.len() > self.max_stop_sequences {
            return Err(ValidationError::StopSequence(
                self.max_stop_sequences,
                stop_sequences.len(),
            ));
        }

        // If seed is None, assign a random one
        let seed = match seed {
            None => thread_rng().gen(),
            Some(seed) => {
                if best_of > 1 {
                    return Err(BestOfSeed);
                }
                seed
            }
        };

        // Check if inputs is empty
        if request.inputs.is_empty() {
            return Err(EmptyInput);
        }

        // Check if truncate is strictly positive and less than max_input_length
        let truncate = truncate
            .map(|value| {
                if value == 0 || value > self.max_input_length {
                    return Err(ValidationError::Truncate(self.max_input_length, value));
                }
                Ok(Some(value))
            })
            .unwrap_or(Ok(None))?;

        // Validate inputs
        let (inputs, input_length) = self
            .validate_input(request.inputs, truncate, max_new_tokens)
            .await?;

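        // Gather the resolved values into the parameter structs shared with
        // the text generation client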
        let parameters = NextTokenChooserParameters {
            temperature,
            repetition_penalty,
            top_k,
            top_p,
            typical_p,
            do_sample,
            seed,
            watermark,
        };
        let stopping_parameters = StoppingCriteriaParameters {
            max_new_tokens,
            stop_sequences,
            ignore_eos_token: false,
        };

        metrics::histogram!("tgi_request_max_new_tokens", max_new_tokens as f64);

        Ok(ValidGenerateRequest {
            inputs,
            decoder_input_details,
            input_length: input_length as u32,
            truncate: truncate.unwrap_or(self.max_input_length) as u32,
            parameters,
            stopping_parameters,
        })
    }

    /// Validate the best_of parameter
    #[instrument(skip_all)]
    pub(crate) fn validate_best_of(&self, best_of: usize) -> Result<usize, ValidationError> {
        if self.max_best_of == 1 && best_of != 1 {
            return Err(ValidationError::BestOfDisabled);
        }

        if best_of > self.max_best_of {
            return Err(ValidationError::BestOf(self.max_best_of, best_of));
        }

        Ok(best_of)
    }
}

/// Start tokenization workers
fn tokenizer_worker(tokenizer: Tokenizer, receiver: flume::Receiver<TokenizerRequest>) {
    // Loop over requests
    while let Ok(((inputs, truncate), response_tx, parent_span)) = receiver.recv() {
        parent_span.in_scope(|| {
            response_tx
                .send(prepare_input(inputs, truncate, &tokenizer))
                .unwrap_or(())
        })
    }
}

/// Get input length and optionally truncate it
fn prepare_input(
    inputs: String,
    truncate: Option<usize>,
    tokenizer: &Tokenizer,
) -> Result<(String, usize), ValidationError> {
    // Get the number of tokens in the input
    let mut encoding = tokenizer
        .encode(inputs.clone(), true)
        .map_err(|err| ValidationError::Tokenizer(err.to_string()))?;

    // Optionally truncate
    let (inputs, input_length) = match truncate {
        // Truncate is some and < encoding length
        Some(truncate) if truncate < encoding.len() => {
            // truncate encoding and decode new inputs
            encoding.truncate(truncate, 0, TruncationDirection::Left);
            let inputs = tokenizer
                .decode(Vec::from(encoding.get_ids()), false)
                .map_err(|err| ValidationError::Tokenizer(err.to_string()))?;
            (inputs, encoding.len())
        }
        // Nothing to do
        _ => (inputs, encoding.len()),
    };

    Ok((inputs, input_length))
}

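/// Message sent to a tokenizer worker: the `(inputs, truncate)` payload, a
/// oneshot channel for the result and the parent tracing span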
type TokenizerRequest = (
    (String, Option<usize>),
    oneshot::Sender<Result<(String, usize), ValidationError>>,
    Span,
);

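/// A generate request that passed validation, with all defaults resolved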
#[derive(Debug)]
pub(crate) struct ValidGenerateRequest {
    pub inputs: String,
    pub input_length: u32,
    pub truncate: u32,
    pub decoder_input_details: bool,
    pub parameters: NextTokenChooserParameters,
    pub stopping_parameters: StoppingCriteriaParameters,
}

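/// Errors raised while validating a payload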
#[derive(Error, Debug)]
pub enum ValidationError {
    #[error("`best_of` must be > 0 and <= {0}. Given: {1}")]
    BestOf(usize, usize),
    #[error("`best_of` != 1 is not allowed for this endpoint")]
    BestOfDisabled,
    #[error("you must use sampling when `best_of` is > 1")]
    BestOfSampling,
    #[error("`seed` must not be set when `best_of` > 1")]
    BestOfSeed,
    #[error("`best_of` != 1 is not supported when streaming tokens")]
    BestOfStream,
    #[error("`decoder_input_details` == true is not supported when streaming tokens")]
    PrefillDetailsStream,
    #[error("`temperature` must be strictly positive")]
    Temperature,
    #[error("`repetition_penalty` must be strictly positive")]
    RepetitionPenalty,
    #[error("`top_p` must be > 0.0 and < 1.0")]
    TopP,
    #[error("`top_k` must be strictly positive")]
    TopK,
    #[error("`truncate` must be strictly positive and less than {0}. Given: {1}")]
    Truncate(usize, usize),
    #[error("`typical_p` must be > 0.0 and < 1.0")]
    TypicalP,
    #[error("`max_new_tokens` must be strictly positive")]
    NegativeMaxNewTokens,
    #[error("`max_new_tokens` must be <= {0}. Given: {1}")]
    MaxNewTokens(usize, u32),
    #[error("`inputs` tokens + `max_new_tokens` must be <= {0}. Given: {1} `inputs` tokens and {2} `max_new_tokens`")]
    MaxTotalTokens(usize, usize, u32),
    #[error("`inputs` must have less than {0} tokens. Given: {1}")]
    InputLength(usize, usize),
    #[error("`inputs` cannot be empty")]
    EmptyInput,
    #[error("`stop` supports up to {0} stop sequences. Given: {1}")]
    StopSequence(usize, usize),
    #[error("tokenizer error {0}")]
    Tokenizer(String),
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::default_parameters;
    use crate::tests::get_tokenizer;

    #[tokio::test]
    async fn test_validation_max_new_tokens() {
        let tokenizer = None;
        let max_best_of = 2;
        let max_stop_sequence = 3;
        let max_input_length = 4;
        let max_total_tokens = 5;
        let workers = 1;
        let validation = Validation::new(
            workers,
            tokenizer,
            max_best_of,
            max_stop_sequence,
            max_input_length,
            max_total_tokens,
        );

        let max_new_tokens = 10;
        match validation
            .validate_input("Hello".to_string(), None, max_new_tokens)
            .await
        {
            Err(ValidationError::MaxNewTokens(1, 10)) => (),
            _ => panic!("expected MaxNewTokens error"),
        }
    }

    #[tokio::test]
    async fn test_validation_input_length() {
        let tokenizer = Some(get_tokenizer().await);
        let max_best_of = 2;
        let max_stop_sequence = 3;
        let max_input_length = 4;
        let max_total_tokens = 5;
        let workers = 1;
        let validation = Validation::new(
            workers,
            tokenizer,
            max_best_of,
            max_stop_sequence,
            max_input_length,
            max_total_tokens,
        );

        let max_new_tokens = 10;
        match validation
            .validate_input("Hello".to_string(), None, max_new_tokens)
            .await
        {
            Err(ValidationError::MaxTotalTokens(5, 1, 10)) => (),
            _ => panic!("expected MaxTotalTokens error"),
        }
    }

    #[tokio::test]
    async fn test_validation_best_of_sampling() {
        let tokenizer = Some(get_tokenizer().await);
        let max_best_of = 2;
        let max_stop_sequence = 3;
        let max_input_length = 4;
        let max_total_tokens = 5;
        let workers = 1;
        let validation = Validation::new(
            workers,
            tokenizer,
            max_best_of,
            max_stop_sequence,
            max_input_length,
            max_total_tokens,
        );
        match validation
            .validate(GenerateRequest {
                inputs: "Hello".to_string(),
                parameters: GenerateParameters {
                    best_of: Some(2),
                    do_sample: false,
                    ..default_parameters()
                },
            })
            .await
        {
            Err(ValidationError::BestOfSampling) => (),
            _ => panic!("expected BestOfSampling error"),
        }
    }

    #[tokio::test]
    async fn test_validation_top_p() {
        let tokenizer = Some(get_tokenizer().await);
        let max_best_of = 2;
        let max_stop_sequence = 3;
        let max_input_length = 4;
        let max_total_tokens = 5;
        let workers = 1;
        let validation = Validation::new(
            workers,
            tokenizer,
            max_best_of,
            max_stop_sequence,
            max_input_length,
            max_total_tokens,
        );
        match validation
            .validate(GenerateRequest {
                inputs: "Hello".to_string(),
                parameters: GenerateParameters {
                    top_p: Some(1.0),
                    ..default_parameters()
                },
            })
            .await
        {
            Err(ValidationError::TopP) => (),
            _ => panic!("expected TopP error"),
        }

        match validation
            .validate(GenerateRequest {
                inputs: "Hello".to_string(),
                parameters: GenerateParameters {
                    top_p: Some(0.99),
                    max_new_tokens: 1,
                    ..default_parameters()
                },
            })
            .await
        {
            Ok(_) => (),
            _ => panic!("Unexpected top_p error"),
        }

        let valid_request = validation
            .validate(GenerateRequest {
                inputs: "Hello".to_string(),
                parameters: GenerateParameters {
                    top_p: None,
                    max_new_tokens: 1,
                    ..default_parameters()
                },
            })
            .await
            .unwrap();
        // top_p == 1.0 is invalid for the user to request, but it is the default resolved value.
        assert_eq!(valid_request.parameters.top_p, 1.0);
    }
}