Unverified commit 7b076cfb, authored by Muthuraj Ramalingakumar and committed by GitHub
Browse files

feat: Parallelize tokenization during batch completion (#1657)

parent 9d7624f1
......@@ -29,6 +29,7 @@ pub mod tools;
use anyhow::Result;
use futures::stream::{self, StreamExt};
use prompt::OAIPromptFormatter;
use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
use std::{collections::HashMap, sync::Arc};
use tracing;
......@@ -220,13 +221,15 @@ impl OpenAIPreprocessor {
builder.token_ids(encoding.token_ids);
}
TextInput::Batch(texts) => {
let mut token_batches = Vec::new();
// TODO: room for optimization here
for text in texts {
let encoding =
tokio::task::block_in_place(|| self.tokenizer.encode(&text))?;
token_batches.push(encoding.token_ids);
}
let token_batches: Result<Vec<Vec<u32>>, _> = texts
.par_iter()
.map(|text| {
tokio::task::block_in_place(|| self.tokenizer.encode(text))
.map(|encoding| encoding.token_ids)
})
.collect();
let token_batches = token_batches?;
builder.batch_token_ids(Some(token_batches));
builder.token_ids(vec![]);
}
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment