queue.rs 12.1 KB
Newer Older
1
2
3
4
5
6
7
8
9
use crate::infer::InferError;
use crate::infer::InferStreamResponse;
use crate::validation::ValidGenerateRequest;
use nohash_hasher::{BuildNoHashHasher, IntMap};
use std::cmp::min;
use text_generation_client::{Batch, Request};
use tokio::sync::mpsc::{UnboundedReceiver, UnboundedSender};
use tokio::sync::{mpsc, oneshot, OwnedSemaphorePermit};
use tokio::time::Instant;
10
use tracing::{info_span, instrument, Span};
11
12
13
14
15
16
17
18

/// Queue entry
#[derive(Debug)]
pub(crate) struct Entry {
    /// Request
    pub request: ValidGenerateRequest,
    /// Response sender to communicate between the Infer struct and the batching_task
    pub response_tx: UnboundedSender<Result<InferStreamResponse, InferError>>,
19
20
21
22
23
24
    /// Span that will live as long as entry
    pub span: Span,
    /// Temporary span used as a guard when logging inference, wait times...
    pub temp_span: Option<Span>,
    /// Instant when this entry was queued
    pub queue_time: Instant,
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
    /// Instant when this entry was added to a batch
    pub batch_time: Option<Instant>,
    /// Permit
    pub _permit: OwnedSemaphorePermit,
}

/// Request Queue
///
/// This is only a handle: the queue state lives in the background task spawned by
/// `Queue::new`, and every operation is sent to that task as a `QueueCommand`.
/// Cloning the handle clones the channel sender, so clones talk to the same task.
#[derive(Debug, Clone)]
pub(crate) struct Queue {
    /// Sending half of the command channel read by the background queue task
    queue_sender: UnboundedSender<QueueCommand>,
}

impl Queue {
    pub(crate) fn new() -> Self {
        // Create channel
        let (queue_sender, queue_receiver) = mpsc::unbounded_channel();

        // Launch background queue task
        tokio::spawn(queue_task(queue_receiver));

        Self { queue_sender }
    }

    /// Append an entry to the queue
50
    #[instrument(skip_all)]
51
52
53
    pub(crate) fn append(&self, entry: Entry) {
        // Send append command to the background task managing the state
        // Unwrap is safe here
54
55
56
        self.queue_sender
            .send(QueueCommand::Append(entry, Span::current()))
            .unwrap();
57
58
59
    }

    // Get the next batch
60
    #[instrument(skip(self))]
61
62
63
64
65
66
67
68
69
70
71
72
73
74
    pub(crate) async fn next_batch(
        &self,
        min_size: Option<usize>,
        max_size: usize,
    ) -> Option<NextBatch> {
        // Create response channel
        let (response_sender, response_receiver) = oneshot::channel();
        // Send next batch command to the background task managing the state
        // Unwrap is safe here
        self.queue_sender
            .send(QueueCommand::NextBatch {
                min_size,
                max_size,
                response_sender,
75
                span: Span::current(),
76
77
78
79
80
81
82
83
84
85
86
87
88
89
            })
            .unwrap();
        // Await on response channel
        // Unwrap is safe here
        response_receiver.await.unwrap()
    }
}

// Background task responsible of the queue state
async fn queue_task(mut receiver: UnboundedReceiver<QueueCommand>) {
    let mut state = State::new();

    while let Some(cmd) = receiver.recv().await {
        match cmd {
90
            QueueCommand::Append(entry, span) => span.in_scope(|| state.append(entry)),
91
92
93
94
            QueueCommand::NextBatch {
                min_size,
                max_size,
                response_sender,
95
96
                span,
            } => span.in_scope(|| {
97
98
                let next_batch = state.next_batch(min_size, max_size);
                response_sender.send(next_batch).unwrap_or(());
99
            }),
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
        }
    }
}

/// Queue State
///
/// Holds the waiting entries and the two monotonically increasing id counters.
#[derive(Debug)]
struct State {
    /// Queue entries in the order they were appended, each paired with the id it was
    /// assigned at that time
    entries: Vec<(u64, Entry)>,

    /// Id that will be assigned to the next appended entry
    next_id: u64,

    /// Id that will be assigned to the next batch
    next_batch_id: u64,
}

impl State {
    fn new() -> Self {
        Self {
            entries: Vec::with_capacity(128),
            next_id: 0,
            next_batch_id: 0,
        }
    }

    /// Append an entry to the queue
127
128
129
130
131
132
    fn append(&mut self, mut entry: Entry) {
        // Create a span that will live as long as the entry is in the queue waiting to be batched
        let queue_span = info_span!(parent: &entry.span, "queued");
        entry.temp_span = Some(queue_span);

        // Push entry in the queue
133
134
        self.entries.push((self.next_id, entry));
        self.next_id += 1;
135
        metrics::increment_gauge!("tgi_queue_size", 1.0);
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
    }

    // Get the next batch
    fn next_batch(&mut self, min_size: Option<usize>, max_size: usize) -> Option<NextBatch> {
        if self.entries.is_empty() {
            return None;
        }

        // Check if we have enough entries
        if let Some(min_size) = min_size {
            if self.entries.len() < min_size {
                return None;
            }
        }

        let next_batch_size = min(self.entries.len(), max_size);

153
154
155
156
        // Create span for this batch to add context to inference calls
        let next_batch_span = info_span!(parent: None, "batch", batch_size = next_batch_size);
        next_batch_span.follows_from(&Span::current());

157
158
159
160
161
162
163
164
        let mut batch_requests = Vec::with_capacity(next_batch_size);
        let mut batch_entries =
            IntMap::with_capacity_and_hasher(next_batch_size, BuildNoHashHasher::default());

        // Drain next_batch_size entries
        self.entries
            .drain(..next_batch_size)
            .for_each(|(id, mut entry)| {
165
166
167
168
169
170
171
172
                // Create a new span to link the batch back to this entry
                let entry_batch_span =
                    info_span!(parent: &entry.span, "infer", batch_size = next_batch_size);
                // Add relationship
                entry_batch_span.follows_from(&next_batch_span);
                // Update entry
                entry.temp_span = Some(entry_batch_span);

173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
                batch_requests.push(Request {
                    id,
                    inputs: entry.request.inputs.clone(),
                    input_length: entry.request.input_length,
                    parameters: Some(entry.request.parameters.clone()),
                    stopping_parameters: Some(entry.request.stopping_parameters.clone()),
                });
                // Set batch_time
                entry.batch_time = Some(Instant::now());
                // Insert in batch_entries IntMap
                batch_entries.insert(id, entry);
            });

        let batch = Batch {
            id: self.next_batch_id,
            requests: batch_requests,
            size: next_batch_size as u32,
        };
        // Increment batch id
        self.next_batch_id += 1;

194
195
        metrics::gauge!("tgi_queue_size", self.entries.len() as f64);
        metrics::histogram!("tgi_batch_next_size", batch.size as f64);
196
        Some((batch_entries, batch, next_batch_span))
197
198
199
    }
}

200
/// Payload of a successful `next_batch` call: the drained entries keyed by their queue id,
/// the `Batch` built from them, and the tracing span created for that batch
type NextBatch = (IntMap<u64, Entry>, Batch, Span);
201
202
203

#[derive(Debug)]
enum QueueCommand {
204
    Append(Entry, Span),
205
206
207
208
    NextBatch {
        min_size: Option<usize>,
        max_size: usize,
        response_sender: oneshot::Sender<Option<NextBatch>>,
209
        span: Span,
210
211
212
213
214
215
216
217
218
    },
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Arc;
    use text_generation_client::{NextTokenChooserParameters, StoppingCriteriaParameters};
    use tokio::sync::{mpsc, Semaphore};
    use tracing::info_span;

    /// Build an entry with zeroed request parameters, a permit taken from a fresh
    /// single-permit semaphore, and a response channel whose receiver is dropped right away
    fn default_entry() -> Entry {
        let (response_tx, _) = mpsc::unbounded_channel();
        let _permit = Arc::new(Semaphore::new(1)).try_acquire_owned().unwrap();

        let parameters = NextTokenChooserParameters {
            temperature: 0.0,
            top_k: 0,
            top_p: 0.0,
            typical_p: 0.0,
            do_sample: false,
            seed: 0,
            repetition_penalty: 0.0,
            watermark: false,
        };
        let stopping_parameters = StoppingCriteriaParameters {
            max_new_tokens: 0,
            stop_sequences: vec![],
        };
        let request = ValidGenerateRequest {
            inputs: String::new(),
            input_length: 0,
            parameters,
            stopping_parameters,
        };

        Entry {
            request,
            response_tx,
            span: info_span!("entry"),
            temp_span: None,
            queue_time: Instant::now(),
            batch_time: None,
            _permit,
        }
    }

    // ---- State: exercised directly, without the background task ----

    #[test]
    fn test_append() {
        let mut state = State::new();

        assert_eq!(state.next_id, 0);
        assert!(state.entries.is_empty());

        state.append(default_entry());

        assert_eq!(state.next_id, 1);
        assert_eq!(state.entries.len(), 1);
        let (first_id, _) = state.entries.remove(0);
        assert_eq!(first_id, 0);
    }

    #[test]
    fn test_next_batch_empty() {
        let mut state = State::new();

        for min_size in [None, Some(1)].iter().copied() {
            assert!(state.next_batch(min_size, 1).is_none());
        }
    }

    #[test]
    fn test_next_batch_min_size() {
        let mut state = State::new();
        for _ in 0..2 {
            state.append(default_entry());
        }

        // No minimum: both queued entries are batched
        let (batched, batch, _) = state.next_batch(None, 2).unwrap();
        assert_eq!(batched.len(), 2);
        for id in 0..2u64 {
            assert!(batched.contains_key(&id));
        }
        for id in 0..2u64 {
            assert!(batched[&id].batch_time.is_some());
        }
        assert_eq!(batch.id, 0);
        assert_eq!(batch.size, 2);

        assert_eq!(state.next_id, 2);
        assert!(state.entries.is_empty());
        assert_eq!(state.next_batch_id, 1);

        // A single queued entry does not satisfy a minimum of two
        state.append(default_entry());

        assert!(state.next_batch(Some(2), 2).is_none());

        assert_eq!(state.next_id, 3);
        assert_eq!(state.entries.len(), 1);
        let (remaining_id, _) = state.entries.remove(0);
        assert_eq!(remaining_id, 2);
    }

    #[test]
    fn test_next_batch_max_size() {
        let mut state = State::new();
        for _ in 0..2 {
            state.append(default_entry());
        }

        // Only one entry fits in the first batch
        let (batched, batch, _) = state.next_batch(None, 1).unwrap();
        assert_eq!(batched.len(), 1);
        assert!(batched.contains_key(&0));
        assert_eq!(batch.id, 0);
        assert_eq!(batch.size, 1);

        assert_eq!(state.next_id, 2);
        assert_eq!(state.entries.len(), 1);
        assert_eq!(state.next_batch_id, 1);

        state.append(default_entry());

        // The budget exceeds what is queued: everything left is batched
        let (batched, batch, _) = state.next_batch(None, 3).unwrap();
        assert_eq!(batched.len(), 2);
        for id in 1..3u64 {
            assert!(batched.contains_key(&id));
        }
        assert_eq!(batch.id, 1);
        assert_eq!(batch.size, 2);

        assert_eq!(state.next_id, 3);
        assert!(state.entries.is_empty());
        assert_eq!(state.next_batch_id, 2);
    }

    // ---- Queue: same scenarios through the handle and the background task ----

    #[tokio::test]
    async fn test_queue_append() {
        let queue = Queue::new();
        queue.append(default_entry());
    }

    #[tokio::test]
    async fn test_queue_next_batch_empty() {
        let queue = Queue::new();

        assert!(queue.next_batch(None, 1).await.is_none());
        assert!(queue.next_batch(Some(1), 1).await.is_none());
    }

    #[tokio::test]
    async fn test_queue_next_batch_min_size() {
        let queue = Queue::new();
        for _ in 0..2 {
            queue.append(default_entry());
        }

        let (batched, batch, _) = queue.next_batch(None, 2).await.unwrap();
        assert_eq!(batched.len(), 2);
        for id in 0..2u64 {
            assert!(batched.contains_key(&id));
        }
        for id in 0..2u64 {
            assert!(batched[&id].batch_time.is_some());
        }
        assert_eq!(batch.id, 0);
        assert_eq!(batch.size, 2);

        queue.append(default_entry());

        assert!(queue.next_batch(Some(2), 2).await.is_none());
    }

    #[tokio::test]
    async fn test_queue_next_batch_max_size() {
        let queue = Queue::new();
        for _ in 0..2 {
            queue.append(default_entry());
        }

        let (batched, batch, _) = queue.next_batch(None, 1).await.unwrap();
        assert_eq!(batched.len(), 1);
        assert!(batched.contains_key(&0));
        assert_eq!(batch.id, 0);
        assert_eq!(batch.size, 1);

        queue.append(default_entry());

        let (batched, batch, _) = queue.next_batch(None, 3).await.unwrap();
        assert_eq!(batched.len(), 2);
        for id in 1..3u64 {
            assert!(batched.contains_key(&id));
        }
        assert_eq!(batch.id, 1);
        assert_eq!(batch.size, 2);
    }
}