queue.rs 12.1 KB
Newer Older
1
2
3
4
5
6
use crate::infer::InferError;
use crate::infer::InferStreamResponse;
use crate::validation::ValidGenerateRequest;
use nohash_hasher::{BuildNoHashHasher, IntMap};
use std::cmp::min;
use text_generation_client::{Batch, Request};
7
use tokio::sync::{oneshot, OwnedSemaphorePermit};
8
use tokio::time::Instant;
9
use tracing::{info_span, instrument, Span};
10
11
12
13
14
15
16

/// Queue entry
#[derive(Debug)]
pub(crate) struct Entry {
    /// Request
    pub request: ValidGenerateRequest,
    /// Response sender to communicate between the Infer struct and the batching_task
17
    pub response_tx: flume::Sender<Result<InferStreamResponse, InferError>>,
18
19
20
21
22
23
    /// Span that will live as long as entry
    pub span: Span,
    /// Temporary span used as a guard when logging inference, wait times...
    pub temp_span: Option<Span>,
    /// Instant when this entry was queued
    pub queue_time: Instant,
24
25
26
27
28
29
30
31
32
33
    /// Instant when this entry was added to a batch
    pub batch_time: Option<Instant>,
    /// Permit
    pub _permit: OwnedSemaphorePermit,
}

/// Request Queue
#[derive(Debug, Clone)]
pub(crate) struct Queue {
    /// Channel to communicate with the background queue task
34
    queue_sender: flume::Sender<QueueCommand>,
35
36
37
38
39
}

impl Queue {
    pub(crate) fn new() -> Self {
        // Create channel
40
        let (queue_sender, queue_receiver) = flume::unbounded();
41
42
43
44
45
46
47
48

        // Launch background queue task
        tokio::spawn(queue_task(queue_receiver));

        Self { queue_sender }
    }

    /// Append an entry to the queue
49
    #[instrument(skip_all)]
50
51
52
    pub(crate) fn append(&self, entry: Entry) {
        // Send append command to the background task managing the state
        // Unwrap is safe here
53
54
55
        self.queue_sender
            .send(QueueCommand::Append(entry, Span::current()))
            .unwrap();
56
57
58
    }

    // Get the next batch
59
    #[instrument(skip(self))]
60
61
62
63
64
65
66
67
68
69
70
71
72
73
    pub(crate) async fn next_batch(
        &self,
        min_size: Option<usize>,
        max_size: usize,
    ) -> Option<NextBatch> {
        // Create response channel
        let (response_sender, response_receiver) = oneshot::channel();
        // Send next batch command to the background task managing the state
        // Unwrap is safe here
        self.queue_sender
            .send(QueueCommand::NextBatch {
                min_size,
                max_size,
                response_sender,
74
                span: Span::current(),
75
76
77
78
79
80
81
82
83
            })
            .unwrap();
        // Await on response channel
        // Unwrap is safe here
        response_receiver.await.unwrap()
    }
}

// Background task responsible of the queue state
84
async fn queue_task(receiver: flume::Receiver<QueueCommand>) {
85
86
    let mut state = State::new();

87
    while let Ok(cmd) = receiver.recv_async().await {
88
        match cmd {
89
            QueueCommand::Append(entry, span) => span.in_scope(|| state.append(entry)),
90
91
92
93
            QueueCommand::NextBatch {
                min_size,
                max_size,
                response_sender,
94
95
                span,
            } => span.in_scope(|| {
96
97
                let next_batch = state.next_batch(min_size, max_size);
                response_sender.send(next_batch).unwrap_or(());
98
            }),
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
        }
    }
}

/// State manipulated by the background queue task
#[derive(Debug)]
struct State {
    /// Waiting entries paired with their id, oldest first
    entries: Vec<(u64, Entry)>,

    /// Id that will be assigned to the next appended entry
    next_id: u64,

    /// Id that will be assigned to the next batch
    next_batch_id: u64,
}

impl State {
    fn new() -> Self {
        Self {
            entries: Vec::with_capacity(128),
            next_id: 0,
            next_batch_id: 0,
        }
    }

    /// Append an entry to the queue
126
127
128
129
130
131
    fn append(&mut self, mut entry: Entry) {
        // Create a span that will live as long as the entry is in the queue waiting to be batched
        let queue_span = info_span!(parent: &entry.span, "queued");
        entry.temp_span = Some(queue_span);

        // Push entry in the queue
132
133
        self.entries.push((self.next_id, entry));
        self.next_id += 1;
134
        metrics::increment_gauge!("tgi_queue_size", 1.0);
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
    }

    // Get the next batch
    fn next_batch(&mut self, min_size: Option<usize>, max_size: usize) -> Option<NextBatch> {
        if self.entries.is_empty() {
            return None;
        }

        // Check if we have enough entries
        if let Some(min_size) = min_size {
            if self.entries.len() < min_size {
                return None;
            }
        }

        let next_batch_size = min(self.entries.len(), max_size);

152
153
154
155
        // Create span for this batch to add context to inference calls
        let next_batch_span = info_span!(parent: None, "batch", batch_size = next_batch_size);
        next_batch_span.follows_from(&Span::current());

156
157
158
159
160
161
162
163
        let mut batch_requests = Vec::with_capacity(next_batch_size);
        let mut batch_entries =
            IntMap::with_capacity_and_hasher(next_batch_size, BuildNoHashHasher::default());

        // Drain next_batch_size entries
        self.entries
            .drain(..next_batch_size)
            .for_each(|(id, mut entry)| {
164
165
166
                // Create a new span to link the batch back to this entry
                let entry_batch_span =
                    info_span!(parent: &entry.span, "infer", batch_size = next_batch_size);
167
168
                // Add relationships
                next_batch_span.follows_from(&entry_batch_span);
169
170
171
172
                entry_batch_span.follows_from(&next_batch_span);
                // Update entry
                entry.temp_span = Some(entry_batch_span);

173
174
175
                batch_requests.push(Request {
                    id,
                    inputs: entry.request.inputs.clone(),
176
                    truncate: entry.request.truncate,
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
                    parameters: Some(entry.request.parameters.clone()),
                    stopping_parameters: Some(entry.request.stopping_parameters.clone()),
                });
                // Set batch_time
                entry.batch_time = Some(Instant::now());
                // Insert in batch_entries IntMap
                batch_entries.insert(id, entry);
            });

        let batch = Batch {
            id: self.next_batch_id,
            requests: batch_requests,
            size: next_batch_size as u32,
        };
        // Increment batch id
        self.next_batch_id += 1;

194
195
        metrics::gauge!("tgi_queue_size", self.entries.len() as f64);
        metrics::histogram!("tgi_batch_next_size", batch.size as f64);
196
        Some((batch_entries, batch, next_batch_span))
197
198
199
    }
}

200
/// Result of a successful batch request: the drained entries keyed by their id, the `Batch`
/// built from them, and the tracing span created for that batch.
type NextBatch = (IntMap<u64, Entry>, Batch, Span);
201
202
203

#[derive(Debug)]
enum QueueCommand {
204
    Append(Entry, Span),
205
206
207
208
    NextBatch {
        min_size: Option<usize>,
        max_size: usize,
        response_sender: oneshot::Sender<Option<NextBatch>>,
209
        span: Span,
210
211
212
213
214
215
216
217
    },
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Arc;
    use text_generation_client::{NextTokenChooserParameters, StoppingCriteriaParameters};
    use tokio::sync::Semaphore;
    use tracing::info_span;

    /// Build an entry with an empty input, zeroed parameters, a freshly acquired permit and
    /// a response channel whose receiving half is dropped right away.
    fn default_entry() -> Entry {
        let semaphore = Arc::new(Semaphore::new(1));
        let (response_tx, _) = flume::unbounded();
        let permit = semaphore.try_acquire_owned().unwrap();

        let parameters = NextTokenChooserParameters {
            temperature: 0.0,
            top_k: 0,
            top_p: 0.0,
            typical_p: 0.0,
            do_sample: false,
            seed: 0,
            repetition_penalty: 0.0,
            watermark: false,
        };
        let stopping_parameters = StoppingCriteriaParameters {
            ignore_eos_token: false,
            max_new_tokens: 0,
            stop_sequences: vec![],
        };
        let request = ValidGenerateRequest {
            inputs: String::new(),
            truncate: 0,
            parameters,
            stopping_parameters,
        };

        Entry {
            request,
            response_tx,
            span: info_span!("entry"),
            temp_span: None,
            queue_time: Instant::now(),
            batch_time: None,
            _permit: permit,
        }
    }

    /// Appending assigns id 0 to the first entry and bumps the id counter.
    #[test]
    fn test_append() {
        let mut state = State::new();
        let entry = default_entry();

        assert_eq!(state.next_id, 0);
        assert_eq!(state.entries.len(), 0);

        state.append(entry);

        assert_eq!(state.next_id, 1);
        assert_eq!(state.entries.len(), 1);
        let (first_id, _) = state.entries.remove(0);
        assert_eq!(first_id, 0);
    }

    /// An empty state never yields a batch, with or without a minimum size.
    #[test]
    fn test_next_batch_empty() {
        let mut state = State::new();

        assert!(state.next_batch(None, 1).is_none());
        assert!(state.next_batch(Some(1), 1).is_none());
    }

    /// A batch is refused when fewer than `min_size` entries are waiting.
    #[test]
    fn test_next_batch_min_size() {
        let mut state = State::new();
        for _ in 0..2 {
            state.append(default_entry());
        }

        let (entries, batch, _) = state.next_batch(None, 2).unwrap();
        assert_eq!(entries.len(), 2);
        for id in 0..2u64 {
            assert!(entries.contains_key(&id));
        }
        for id in 0..2u64 {
            assert!(entries.get(&id).unwrap().batch_time.is_some());
        }
        assert_eq!(batch.id, 0);
        assert_eq!(batch.size, 2);

        assert_eq!(state.next_id, 2);
        assert_eq!(state.entries.len(), 0);
        assert_eq!(state.next_batch_id, 1);

        state.append(default_entry());

        // Only one entry is waiting, two are required
        assert!(state.next_batch(Some(2), 2).is_none());

        assert_eq!(state.next_id, 3);
        assert_eq!(state.entries.len(), 1);
        let (remaining_id, _) = state.entries.remove(0);
        assert_eq!(remaining_id, 2);
    }

    /// A batch never holds more than `max_size` entries; the rest stays queued.
    #[test]
    fn test_next_batch_max_size() {
        let mut state = State::new();
        for _ in 0..2 {
            state.append(default_entry());
        }

        let (entries, batch, _) = state.next_batch(None, 1).unwrap();
        assert_eq!(entries.len(), 1);
        assert!(entries.contains_key(&0));
        assert_eq!(batch.id, 0);
        assert_eq!(batch.size, 1);

        assert_eq!(state.next_id, 2);
        assert_eq!(state.entries.len(), 1);
        assert_eq!(state.next_batch_id, 1);

        state.append(default_entry());

        let (entries, batch, _) = state.next_batch(None, 3).unwrap();
        assert_eq!(entries.len(), 2);
        for id in 1..3u64 {
            assert!(entries.contains_key(&id));
        }
        assert_eq!(batch.id, 1);
        assert_eq!(batch.size, 2);

        assert_eq!(state.next_id, 3);
        assert_eq!(state.entries.len(), 0);
        assert_eq!(state.next_batch_id, 2);
    }

    /// Appending through the handle does not panic.
    #[tokio::test]
    async fn test_queue_append() {
        let queue = Queue::new();
        queue.append(default_entry());
    }

    /// Same as `test_next_batch_empty`, going through the handle.
    #[tokio::test]
    async fn test_queue_next_batch_empty() {
        let queue = Queue::new();

        assert!(queue.next_batch(None, 1).await.is_none());
        assert!(queue.next_batch(Some(1), 1).await.is_none());
    }

    /// Same as `test_next_batch_min_size`, going through the handle.
    #[tokio::test]
    async fn test_queue_next_batch_min_size() {
        let queue = Queue::new();
        for _ in 0..2 {
            queue.append(default_entry());
        }

        let (entries, batch, _) = queue.next_batch(None, 2).await.unwrap();
        assert_eq!(entries.len(), 2);
        for id in 0..2u64 {
            assert!(entries.contains_key(&id));
        }
        for id in 0..2u64 {
            assert!(entries.get(&id).unwrap().batch_time.is_some());
        }
        assert_eq!(batch.id, 0);
        assert_eq!(batch.size, 2);

        queue.append(default_entry());

        assert!(queue.next_batch(Some(2), 2).await.is_none());
    }

    /// Same as `test_next_batch_max_size`, going through the handle.
    #[tokio::test]
    async fn test_queue_next_batch_max_size() {
        let queue = Queue::new();
        for _ in 0..2 {
            queue.append(default_entry());
        }

        let (entries, batch, _) = queue.next_batch(None, 1).await.unwrap();
        assert_eq!(entries.len(), 1);
        assert!(entries.contains_key(&0));
        assert_eq!(batch.id, 0);
        assert_eq!(batch.size, 1);

        queue.append(default_entry());

        let (entries, batch, _) = queue.next_batch(None, 3).await.unwrap();
        assert_eq!(entries.len(), 2);
        for id in 1..3u64 {
            assert!(entries.contains_key(&id));
        }
        assert_eq!(batch.id, 1);
        assert_eq!(batch.size, 2);
    }
}