Unverified Commit 43df056e authored by Funtowicz Morgan's avatar Funtowicz Morgan Committed by GitHub
Browse files

[TENSORRT-LLM] - Implement new looper thread based backend (#2357)



* (backend) use parking_lot crate for RwLock fairness

# Conflicts:
#	backends/trtllm/src/backend.rs

* (launcher) default new server::run parameters to false for now

* (chore) fmt ... why?

* (ffi) use const for GetSamplingConfig

* (server) expose new SchedulingError

* (trt)

* (build) setup ccache if available

* (ffi) add max_new_tokens parameters

* (backend) cleanup a bit

* (backend) expose PullNewTokens

* (ffi) cleanup again

* (ffi) add missing headers imports

* (ffi) add template specialization to catch and convert to Rust Result<T, tensorrt_llm::common::TllmException>

* (looper) new looper initial implementation

* (ffi) remove narrowing type warning

* (ffi) encode the provided user prompt within each request thread

* (misc) change scope identifiers

* (backend) implement the post_processor background thread

* (misc) missing Result types for Rust

* use blocking_recv in looper to consume awaiting_requests at max before pulling in a single step

* (server) forward auth_token to server::run

* (build) fetchcontent use archives instead of git

* (ffi) fix usage of wrong vector constructor making a capacity fill call

* (ffi) missing namespace for tle::Response

* (ffi) do not use reference capture in lambda as we are not capturing anything

* (backend) refactor & cleanup

* (Dockerfile.trtllm) delete for now

* (misc) simplify [make_]move_iterator by using c++20 type inference

* (misc) no need to move for uint32_t items

* (scheduler) rework submit/pull logic

* (post) impl postprocessing

* (misc) delete backend.rs

* (misc) rerun-if-changed all the cmake modules

* (misc) move to latest trtllm

* (fix): HOPPER_SM_MAJOR is 9 not 8

* (misc: build for sm_{75,80,86,89,90} by default

* (misc): build with trtllm 0.13.0

* (misc): increase verbosity of spdlog

* (fix): do not recreate the stateful hashmap at every it

* (misc): update dependency in trtllm dockerfile

* (misc): update dependency in trtllm dockerfile

* (misc): disable logging in release mode

* (misc): improve trtllm download script robustness

* (fix): ore fixes for Dockerfile

* misc(cuda): require 12.6

* chore(cmake): use correct policy for download_timestamp

* feat(looper): check engine and executorWorker paths exist before creating the backend

* chore(cmake): download timestamp should be before URL

* feat(looper): minor optimizations to avoid growing too much the containers

* chore(trtllm): move dockerfile to right place

* chore(trtllm): disable tokenizer parallelism by default

* chore(trtllm): fmt

* chore(trtllm): post-rebase commit

* chore(trtllm): remove unused method

* feat(trtllm): cache maxNumTokens to avoid calling JSON everytime

* misc(router): remove SchedulingError

* feat(trtllm): do not tokenize twice

* Revert "chore(trtllm): remove unused method"

This reverts commit 31747163

* chore(rebase): fix invalid references

* chore(router): add python dependency

* Lint.

* Fix bad rebase

---------
Co-authored-by: default avatarNicolas Patry <patry.nicolas@protonmail.com>
parent ed87b464
use std::path::{Path, PathBuf};
use clap::Parser; use clap::Parser;
use std::collections::HashMap; use hf_hub::api::tokio::{Api, ApiBuilder};
use std::path::PathBuf; use hf_hub::{Cache, Repo, RepoType};
use tokenizers::Tokenizer;
use tracing::info;
use text_generation_backends_trtllm::errors::TensorRtLlmBackendError; use text_generation_backends_trtllm::errors::TensorRtLlmBackendError;
use text_generation_backends_trtllm::TensorRtLlmBackend; use text_generation_backends_trtllm::TensorRtLlmBackendV2;
use text_generation_router::{server, usage_stats}; use text_generation_router::server::get_base_tokenizer;
use tokenizers::{FromPretrainedParameters, Tokenizer}; use text_generation_router::usage_stats::UsageStatsLevel;
use text_generation_router::{server, HubTokenizerConfig};
/// App Configuration /// App Configuration
#[derive(Parser, Debug)] #[derive(Parser, Debug)]
...@@ -58,6 +64,130 @@ struct Args { ...@@ -58,6 +64,130 @@ struct Args {
usage_stats: usage_stats::UsageStatsLevel, usage_stats: usage_stats::UsageStatsLevel,
} }
async fn get_tokenizer(
tokenizer_name: &str,
tokenizer_config_path: Option<&str>,
revision: Option<&str>,
) -> Option<Tokenizer> {
// Parse Huggingface hub token
let authorization_token = std::env::var("HF_TOKEN")
.or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN"))
.ok();
// Tokenizer instance
let local_path = Path::new(tokenizer_name);
// Shared API builder initialization
let api_builder = || {
let mut builder = ApiBuilder::new()
.with_progress(false)
.with_token(authorization_token);
if let Ok(cache_dir) = std::env::var("HUGGINGFACE_HUB_CACHE") {
builder = builder.with_cache_dir(cache_dir.into());
}
builder
};
// Decide if we need to use the API based on the revision and local path
let use_api = revision.is_some() || !local_path.exists() || !local_path.is_dir();
// Initialize API if needed
#[derive(Clone)]
enum Type {
Api(Api),
Cache(Cache),
None,
}
let api = if use_api {
if std::env::var("HF_HUB_OFFLINE") == Ok("1".to_string()) {
let cache = std::env::var("HUGGINGFACE_HUB_CACHE")
.map_err(|_| ())
.map(|cache_dir| Cache::new(cache_dir.into()))
.unwrap_or_else(|_| Cache::default());
tracing::warn!("Offline mode active using cache defaults");
Type::Cache(cache)
} else {
tracing::info!("Using the Hugging Face API");
match api_builder().build() {
Ok(api) => Type::Api(api),
Err(_) => {
tracing::warn!("Unable to build the Hugging Face API");
Type::None
}
}
}
} else {
Type::None
};
// Load tokenizer and model info
let (
tokenizer_filename,
_config_filename,
tokenizer_config_filename,
_preprocessor_config_filename,
_processor_config_filename,
) = match api {
Type::None => (
Some(local_path.join("tokenizer.json")),
Some(local_path.join("config.json")),
Some(local_path.join("tokenizer_config.json")),
Some(local_path.join("preprocessor_config.json")),
Some(local_path.join("processor_config.json")),
),
Type::Api(api) => {
let api_repo = api.repo(Repo::with_revision(
tokenizer_name.to_string(),
RepoType::Model,
revision.unwrap_or_else(|| "main").to_string(),
));
let tokenizer_filename = match api_repo.get("tokenizer.json").await {
Ok(tokenizer_filename) => Some(tokenizer_filename),
Err(_) => get_base_tokenizer(&api, &api_repo).await,
};
let config_filename = api_repo.get("config.json").await.ok();
let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok();
let preprocessor_config_filename = api_repo.get("preprocessor_config.json").await.ok();
let processor_config_filename = api_repo.get("processor_config.json").await.ok();
(
tokenizer_filename,
config_filename,
tokenizer_config_filename,
preprocessor_config_filename,
processor_config_filename,
)
}
Type::Cache(cache) => {
let repo = cache.repo(Repo::with_revision(
tokenizer_name.to_string(),
RepoType::Model,
revision.clone().unwrap_or_else(|| "main").to_string(),
));
(
repo.get("tokenizer.json"),
repo.get("config.json"),
repo.get("tokenizer_config.json"),
repo.get("preprocessor_config.json"),
repo.get("processor_config.json"),
)
}
};
// Read the JSON contents of the file as an instance of 'HubTokenizerConfig'.
let tokenizer_config: Option<HubTokenizerConfig> = if let Some(filename) = tokenizer_config_path
{
HubTokenizerConfig::from_file(filename)
} else {
tokenizer_config_filename.and_then(HubTokenizerConfig::from_file)
};
tokenizer_filename.and_then(|filename| Tokenizer::from_file(filename).ok())
}
#[tokio::main] #[tokio::main]
async fn main() -> Result<(), TensorRtLlmBackendError> { async fn main() -> Result<(), TensorRtLlmBackendError> {
// Get args // Get args
...@@ -124,18 +254,26 @@ async fn main() -> Result<(), TensorRtLlmBackendError> { ...@@ -124,18 +254,26 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
))); )));
} }
// Run server // Create the backend
let tokenizer = Tokenizer::from_pretrained( let tokenizer = get_tokenizer(
tokenizer_name.clone(), &tokenizer_name,
Some(FromPretrainedParameters { tokenizer_config_path.as_deref(),
revision: revision.clone().unwrap_or(String::from("main")), revision.as_deref(),
user_agent: HashMap::new(),
auth_token,
}),
) )
.map_err(|e| TensorRtLlmBackendError::Tokenizer(e.to_string()))?; .await
.expect("Failed to retrieve tokenizer implementation");
let backend = TensorRtLlmBackend::new(tokenizer, model_id, executor_worker)?; info!("Successfully retrieved tokenizer {}", &tokenizer_name);
let backend = TensorRtLlmBackendV2::new(
tokenizer,
model_id,
executor_worker,
max_concurrent_requests,
)?;
info!("Successfully created backend");
// Run server
server::run( server::run(
backend, backend,
max_concurrent_requests, max_concurrent_requests,
...@@ -145,7 +283,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> { ...@@ -145,7 +283,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
max_input_tokens, max_input_tokens,
max_total_tokens, max_total_tokens,
validation_workers, validation_workers,
None, auth_token,
tokenizer_name, tokenizer_name,
tokenizer_config_path, tokenizer_config_path,
revision, revision,
......
///
/// Extract the first line of the provided string reference.
/// If there is no lines in the buffer, it returns a string
/// which content is defined by the content of `fail`
/// # Arguments
///
/// * `s`: The string buffer to extract the first-line from
/// * `fail`: A string content which is returned if no lines are
/// present in `s`
///
/// returns: String
///
/// # Examples
///
/// ```
/// let s = "My name is Morgan.\n I'm working at Hugging Face.";
/// first_line(s, "No line in string");
/// ```
#[inline]
pub(crate) fn first_line(s: &str, fail: &str) -> String {
s.lines().next().unwrap_or(fail).to_string()
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment