use crate::validation::ValidationError::{BestOfSampling, BestOfSeed, EmptyInput};
/// Payload validation logic
use crate::{GenerateParameters, GenerateRequest};
use rand::{thread_rng, Rng};
use text_generation_client::{NextTokenChooserParameters, StoppingCriteriaParameters};
use thiserror::Error;
use tokenizers::tokenizer::Tokenizer;
use tokenizers::TruncationDirection;
use tokio::sync::oneshot;
use tracing::{instrument, Span};

/// Validation
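/// Validates request payloads and runs input tokenization on a pool of
/// background workers.
///
/// A minimal construction sketch (not a doctest: `new` is crate-private,
/// and the values here are illustrative):
///
/// ```ignore
/// let validation = Validation::new(
///     2,          // tokenization workers
///     tokenizer,  // Option<Tokenizer>
///     2,          // max_best_of
///     4,          // max_stop_sequences
///     1024,       // max_input_length
///     2048,       // max_total_tokens (must be > max_input_length)
/// );
/// ```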
#[derive(Debug, Clone)]
pub struct Validation {
    /// Validation parameters
    max_best_of: usize,
    max_stop_sequences: usize,
    max_input_length: usize,
    max_total_tokens: usize,
    /// Channel to communicate with the background tokenization task
    sender: Option<flume::Sender<TokenizerRequest>>,
}

impl Validation {
    pub(crate) fn new(
        workers: usize,
        tokenizer: Option<Tokenizer>,
        max_best_of: usize,
        max_stop_sequences: usize,
        max_input_length: usize,
        max_total_tokens: usize,
    ) -> Self {
        if max_input_length >= max_total_tokens {
            panic!("`max_input_length` must be < `max_total_tokens`");
        }

        // If we have a fast tokenizer
        let sender = if let Some(tokenizer) = tokenizer {
            // Create channel
            let (validation_sender, validation_receiver) = flume::unbounded();

            // Create workers
            for _ in 0..workers {
                let tokenizer_clone = tokenizer.clone();
                let receiver_clone = validation_receiver.clone();

                // Spawn worker
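                // Tokenization is CPU-bound, so use the blocking thread pool
                // rather than stalling the async runtime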
                tokio::task::spawn_blocking(move || {
                    tokenizer_worker(tokenizer_clone, receiver_clone)
                });
            }
            Some(validation_sender)
        } else {
            None
        };

        Self {
            max_best_of,
            sender,
            max_stop_sequences,
            max_input_length,
            max_total_tokens,
        }
    }

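    /// Tokenize `inputs` on the background workers (when a tokenizer is
    /// available) and check the token budget against `max_input_length`
    /// and `max_total_tokens`. Returns the possibly truncated inputs and
    /// their length in tokens.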
    #[instrument(skip_all)]
    async fn validate_input(
        &self,
        inputs: String,
        truncate: Option<usize>,
        max_new_tokens: u32,
    ) -> Result<(String, usize), ValidationError> {
        // If we have a fast tokenizer
        if let Some(sender) = &self.sender {
            // Create response channel
            let (response_sender, response_receiver) = oneshot::channel();
            // Send request to the background validation task
            // Unwrap is safe here: the workers hold the receiving end for the lifetime of the process
            sender
                .send(((inputs, truncate), response_sender, Span::current()))
                .unwrap();

            // Await on response channel
            // Unwrap is safe here: the worker always sends a response before dropping the oneshot sender
            let (inputs, input_length) = response_receiver.await.unwrap()?;

            // Get total tokens
            let total_tokens = input_length + max_new_tokens as usize;

            // Validate MaxTotalTokens
            if total_tokens > self.max_total_tokens {
                return Err(ValidationError::MaxTotalTokens(
                    self.max_total_tokens,
                    input_length,
                    max_new_tokens,
                ));
            }

            // Validate InputLength
            if input_length > self.max_input_length {
                return Err(ValidationError::InputLength(
                    self.max_input_length,
                    input_length,
                ));
            }

            metrics::histogram!("tgi_request_input_length", input_length as f64);
            Ok((inputs, input_length))
        }
        // Return inputs without validation
        else {
            // In this case, we don't know the real length in tokens of the inputs
            // However, the inputs will be truncated by the Python servers
            // We make sure that truncate + max_new_tokens <= self.max_total_tokens
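            // e.g. with max_input_length = 4 and max_total_tokens = 5, a request
            // without `truncate` may ask for at most 1 new token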
            let input_length = truncate.unwrap_or(self.max_input_length);

            // Validate MaxNewTokens
            if (input_length as u32 + max_new_tokens) > self.max_total_tokens as u32 {
                return Err(ValidationError::MaxNewTokens(
                    self.max_total_tokens - self.max_input_length,
                    max_new_tokens,
                ));
            }

            Ok((inputs, input_length))
        }
    }

    /// Validate a payload and get the number of tokens in the input
    #[instrument(skip_all)]
    pub(crate) async fn validate(
        &self,
        request: GenerateRequest,
    ) -> Result<ValidGenerateRequest, ValidationError> {
        let GenerateParameters {
            best_of,
            temperature,
            repetition_penalty,
            top_k,
            top_p,
            typical_p,
            do_sample,
            max_new_tokens,
            stop: stop_sequences,
            truncate,
            seed,
            watermark,
            decoder_input_details,
            ..
        } = request.parameters;

        // sampling must be true when best_of > 1
        let best_of = best_of.unwrap_or(1);
        let sampling = do_sample
            || temperature.is_some()
            || top_k.is_some()
            || top_p.is_some()
            || typical_p.is_some();

        if best_of > 1 && !sampling {
            return Err(BestOfSampling);
        }

        let temperature = temperature.unwrap_or(1.0);
        if temperature <= 0.0 {
            return Err(ValidationError::Temperature);
        }

        let repetition_penalty = repetition_penalty.unwrap_or(1.0);
        if repetition_penalty <= 0.0 {
            return Err(ValidationError::RepetitionPenalty);
        }

        // Handled differently because the proto default value is not a valid
        // value for the user to supply
        let top_p = top_p
            .map(|value| {
                if value <= 0.0 || value >= 1.0 {
                    return Err(ValidationError::TopP);
                }
                Ok(value)
            })
            .unwrap_or(Ok(1.0))?;

        let typical_p = typical_p
            .map(|value| {
                if value <= 0.0 || value >= 1.0 {
                    return Err(ValidationError::TypicalP);
                }
                Ok(value)
            })
            .unwrap_or(Ok(1.0))?;

        let top_k: u32 = top_k
            .map(|value| {
                if value <= 0 {
                    return Err(ValidationError::TopK);
                }
                Ok(value as u32)
            })
            .unwrap_or(Ok(0))?;

        if max_new_tokens == 0 {
            return Err(ValidationError::NegativeMaxNewTokens);
        }

        if stop_sequences.len() > self.max_stop_sequences {
            return Err(ValidationError::StopSequence(
                self.max_stop_sequences,
                stop_sequences.len(),
            ));
        }

        // If seed is None, assign a random one
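        // (a fixed seed with `best_of` > 1 would presumably make the candidate
        // sequences identical, hence the `BestOfSeed` error below)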
        let seed = match seed {
            None => thread_rng().gen(),
            Some(seed) => {
                if best_of > 1 {
                    return Err(BestOfSeed);
                }
                seed
            }
        };

        // Check if inputs is empty
        if request.inputs.is_empty() {
            return Err(EmptyInput);
        }

        // Check that truncate is > 0 and <= max_input_length
        let truncate = truncate
            .map(|value| {
                if value == 0 || value > self.max_input_length {
                    return Err(ValidationError::Truncate(self.max_input_length, value));
                }
                Ok(Some(value))
            })
            .unwrap_or(Ok(None))?;

        // Validate inputs
        let (inputs, input_length) = self
            .validate_input(request.inputs, truncate, max_new_tokens)
            .await?;

        let parameters = NextTokenChooserParameters {
            temperature,
            repetition_penalty,
            top_k,
            top_p,
            typical_p,
            do_sample,
            seed,
            watermark,
        };
        let stopping_parameters = StoppingCriteriaParameters {
            max_new_tokens,
            stop_sequences,
            ignore_eos_token: false,
        };

        metrics::histogram!("tgi_request_max_new_tokens", max_new_tokens as f64);

        Ok(ValidGenerateRequest {
            inputs,
            decoder_input_details,
            input_length: input_length as u32,
            truncate: truncate.unwrap_or(self.max_input_length) as u32,
            parameters,
            stopping_parameters,
        })
    }

    /// Validate the best_of parameter
    #[instrument(skip_all)]
    pub(crate) fn validate_best_of(&self, best_of: usize) -> Result<usize, ValidationError> {
        if self.max_best_of == 1 && best_of != 1 {
            return Err(ValidationError::BestOfDisabled);
        }

        if best_of > self.max_best_of {
            return Err(ValidationError::BestOf(self.max_best_of, best_of));
        }

        Ok(best_of)
    }
}

/// Start tokenization workers
fn tokenizer_worker(tokenizer: Tokenizer, receiver: flume::Receiver<TokenizerRequest>) {
    // Loop over requests
    while let Ok(((inputs, truncate), response_tx, parent_span)) = receiver.recv() {
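        // Tokenize inside the request's span, and ignore the send error that
        // occurs if the requester has already dropped the response channel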
        parent_span.in_scope(|| {
            response_tx
                .send(prepare_input(inputs, truncate, &tokenizer))
                .unwrap_or(())
        })
    }
}

/// Get input length and optionally truncate it
fn prepare_input(
    inputs: String,
    truncate: Option<usize>,
    tokenizer: &Tokenizer,
) -> Result<(String, usize), ValidationError> {
    // Get the number of tokens in the input
    let mut encoding = tokenizer
        .encode(inputs.clone(), true)
        .map_err(|err| ValidationError::Tokenizer(err.to_string()))?;

    // Optionally truncate
    let (inputs, input_length) = match truncate {
        // Truncate is some and < encoding length
        Some(truncate) if truncate < encoding.len() => {
            // truncate encoding and decode new inputs
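            // (truncating from the left keeps the end of the prompt, the
            // tokens closest to the position where generation starts)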
            encoding.truncate(truncate, 0, TruncationDirection::Left);
            let inputs = tokenizer
                .decode(Vec::from(encoding.get_ids()), false)
                .map_err(|err| ValidationError::Tokenizer(err.to_string()))?;
            (inputs, encoding.len())
        }
        // Nothing to do
        _ => (inputs, encoding.len()),
    };

    Ok((inputs, input_length))
}

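/// Request sent to a tokenization worker:
/// ((inputs, truncate), response channel, parent tracing span)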
type TokenizerRequest = (
    (String, Option<usize>),
    oneshot::Sender<Result<(String, usize), ValidationError>>,
    Span,
);

#[derive(Debug)]
pub(crate) struct ValidGenerateRequest {
    pub inputs: String,
    pub input_length: u32,
    pub truncate: u32,
    pub decoder_input_details: bool,
    pub parameters: NextTokenChooserParameters,
    pub stopping_parameters: StoppingCriteriaParameters,
}

#[derive(Error, Debug)]
pub enum ValidationError {
    #[error("`best_of` must be > 0 and <= {0}. Given: {1}")]
    BestOf(usize, usize),
    #[error("`best_of` != 1 is not allowed for this endpoint")]
    BestOfDisabled,
    #[error("you must use sampling when `best_of` is > 1")]
    BestOfSampling,
    #[error("`seed` must not be set when `best_of` > 1")]
    BestOfSeed,
    #[error("`best_of` != 1 is not supported when streaming tokens")]
    BestOfStream,
    #[error("`decoder_input_details` == true is not supported when streaming tokens")]
    PrefillDetailsStream,
    #[error("`temperature` must be strictly positive")]
    Temperature,
    #[error("`repetition_penalty` must be strictly positive")]
    RepetitionPenalty,
    #[error("`top_p` must be > 0.0 and < 1.0")]
    TopP,
    #[error("`top_k` must be strictly positive")]
    TopK,
    #[error("`truncate` must be strictly positive and less than {0}. Given: {1}")]
    Truncate(usize, usize),
    #[error("`typical_p` must be > 0.0 and < 1.0")]
    TypicalP,
    #[error("`max_new_tokens` must be strictly positive")]
    NegativeMaxNewTokens,
    #[error("`max_new_tokens` must be <= {0}. Given: {1}")]
    MaxNewTokens(usize, u32),
    #[error("`inputs` tokens + `max_new_tokens` must be <= {0}. Given: {1} `inputs` tokens and {2} `max_new_tokens`")]
    MaxTotalTokens(usize, usize, u32),
    #[error("`inputs` must have less than {0} tokens. Given: {1}")]
    InputLength(usize, usize),
    #[error("`inputs` cannot be empty")]
    EmptyInput,
    #[error("`stop` supports up to {0} stop sequences. Given: {1}")]
    StopSequence(usize, usize),
    #[error("tokenizer error {0}")]
    Tokenizer(String),
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::default_parameters;
    use crate::tests::get_tokenizer;

    #[tokio::test]
    async fn test_validation_max_new_tokens() {
        let tokenizer = None;
        let max_best_of = 2;
        let max_stop_sequence = 3;
        let max_input_length = 4;
        let max_total_tokens = 5;
        let workers = 1;
        let validation = Validation::new(
            workers,
            tokenizer,
            max_best_of,
            max_stop_sequence,
            max_input_length,
            max_total_tokens,
        );

        let max_new_tokens = 10;
        match validation
            .validate_input("Hello".to_string(), None, max_new_tokens)
            .await
        {
            Err(ValidationError::MaxNewTokens(1, 10)) => (),
            _ => panic!("Unexpected not max new tokens"),
        }
    }

    #[tokio::test]
    async fn test_validation_input_length() {
        let tokenizer = Some(get_tokenizer().await);
        let max_best_of = 2;
        let max_stop_sequence = 3;
        let max_input_length = 4;
        let max_total_tokens = 5;
        let workers = 1;
        let validation = Validation::new(
            workers,
            tokenizer,
            max_best_of,
            max_stop_sequence,
            max_input_length,
            max_total_tokens,
        );

        let max_new_tokens = 10;
        match validation
            .validate_input("Hello".to_string(), None, max_new_tokens)
            .await
        {
            Err(ValidationError::MaxTotalTokens(5, 1, 10)) => (),
            _ => panic!("Unexpected not max new tokens"),
        }
    }

    #[tokio::test]
    async fn test_validation_best_of_sampling() {
        let tokenizer = Some(get_tokenizer().await);
        let max_best_of = 2;
        let max_stop_sequence = 3;
        let max_input_length = 4;
        let max_total_tokens = 5;
        let workers = 1;
        let validation = Validation::new(
            workers,
            tokenizer,
            max_best_of,
            max_stop_sequence,
            max_input_length,
            max_total_tokens,
        );
        match validation
            .validate(GenerateRequest {
                inputs: "Hello".to_string(),
                parameters: GenerateParameters {
                    best_of: Some(2),
                    do_sample: false,
                    ..default_parameters()
                },
            })
            .await
        {
            Err(ValidationError::BestOfSampling) => (),
            _ => panic!("Unexpected not best of sampling"),
        }
    }

    #[tokio::test]
    async fn test_validation_top_p() {
        let tokenizer = Some(get_tokenizer().await);
        let max_best_of = 2;
        let max_stop_sequence = 3;
        let max_input_length = 4;
        let max_total_tokens = 5;
        let workers = 1;
        let validation = Validation::new(
            workers,
            tokenizer,
            max_best_of,
            max_stop_sequence,
            max_input_length,
            max_total_tokens,
        );
        match validation
            .validate(GenerateRequest {
                inputs: "Hello".to_string(),
                parameters: GenerateParameters {
                    top_p: Some(1.0),
                    ..default_parameters()
                },
            })
            .await
        {
            Err(ValidationError::TopP) => (),
            _ => panic!("Unexpected top_p"),
        }

        match validation
            .validate(GenerateRequest {
                inputs: "Hello".to_string(),
                parameters: GenerateParameters {
                    top_p: Some(0.99),
                    max_new_tokens: 1,
                    ..default_parameters()
                },
            })
            .await
        {
            Ok(_) => (),
            _ => panic!("Unexpected top_p error"),
        }

        let valid_request = validation
            .validate(GenerateRequest {
                inputs: "Hello".to_string(),
                parameters: GenerateParameters {
                    top_p: None,
                    max_new_tokens: 1,
                    ..default_parameters()
                },
            })
            .await
            .unwrap();
        // top_p == 1.0 is rejected when supplied by the user, but it is the resolved default when top_p is None
        assert_eq!(valid_request.parameters.top_p, 1.0);
    }
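
    // A sketch of an additional test for the `truncate` bounds check, mirroring
    // the setup used by the tests above (values are illustrative).
    #[tokio::test]
    async fn test_validation_truncate() {
        let tokenizer = Some(get_tokenizer().await);
        let validation = Validation::new(1, tokenizer, 2, 3, 4, 5);

        // `truncate` == 0 is rejected with the configured max_input_length (4)
        match validation
            .validate(GenerateRequest {
                inputs: "Hello".to_string(),
                parameters: GenerateParameters {
                    truncate: Some(0),
                    max_new_tokens: 1,
                    ..default_parameters()
                },
            })
            .await
        {
            Err(ValidationError::Truncate(4, 0)) => (),
            _ => panic!("Expected a Truncate error"),
        }
    }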
}