Commit 92f06b0e (unverified), authored by Graham King, committed by GitHub
Browse files

chore(dynamo-run): Refactor to library (#1687)

Move much of what was in the `dynamo-run` crate into `dynamo-llm` so that everyone can use it.

Example usage:

1. Create a `LocalModel`:

```
    let local_model = LocalModelBuilder::default()
	.model_path("Qwen/Qwen3-0.6B")
	.http_port(8080)
	.build().await?;
```

2. Make an engine:

```
    let engine_config = EngineConfig::StaticFull {
	engine: dynamo_engine_mistralrs::make_engine(&local_model).await?,
	model: Box::new(local_model),
    };
```

3. Connect it to an input and run it:

```
    dynamo_llm::entrypoint::input::run_input(Input::Http, runtime, engine_config).await?;
```

For https://github.com/ai-dynamo/dynamo/issues/1647

Code Rabbit summary, thanks:
  * Introduced a flexible builder pattern for local model configuration, allowing advanced customization and easier initialization.
  * Added new input modes and unified input handling, supporting interactive chat, HTTP server, batch file, and distributed endpoint modes.
  * Centralized engine configuration and routing, enabling more extensible and maintainable engine management.
  * Simplified and modularized the codebase by moving input and engine logic into dedicated modules.
  * Replaced direct model construction with an asynchronous builder for improved clarity and extensibility.
  * Streamlined configuration and validation for flags and router settings.
  * Added validation to prevent incompatible input and output combinations in endpoint and dynamic modes.
parent 3b62692f
...@@ -155,11 +155,7 @@ impl Tokens { ...@@ -155,11 +155,7 @@ impl Tokens {
/// ///
/// * `block_size` - The fixed size for each [`TokenBlock`]. /// * `block_size` - The fixed size for each [`TokenBlock`].
/// * `salt_hash` - An optional [`SaltHash`] used as the base seed for hashing. Defaults to 0. /// * `salt_hash` - An optional [`SaltHash`] used as the base seed for hashing. Defaults to 0.
pub fn into_sequence( pub fn into_sequence(self, block_size: u32, salt_hash: Option<SaltHash>) -> TokenBlockSequence {
self,
block_size: usize,
salt_hash: Option<SaltHash>,
) -> TokenBlockSequence {
TokenBlockSequence::new(self, block_size, salt_hash) TokenBlockSequence::new(self, block_size, salt_hash)
} }
} }
...@@ -191,7 +187,7 @@ pub enum TokenBlockError { ...@@ -191,7 +187,7 @@ pub enum TokenBlockError {
#[derive(Debug, PartialEq)] // No Clone: intended to be unique within a sequence #[derive(Debug, PartialEq)] // No Clone: intended to be unique within a sequence
pub struct PartialTokenBlock { pub struct PartialTokenBlock {
tokens: Tokens, tokens: Tokens,
block_size: usize, block_size: u32,
salt_hash: SaltHash, salt_hash: SaltHash,
parent_sequence_hash: Option<SequenceHash>, parent_sequence_hash: Option<SequenceHash>,
} }
...@@ -203,7 +199,7 @@ impl PartialTokenBlock { ...@@ -203,7 +199,7 @@ impl PartialTokenBlock {
/// ///
/// * `block_size` - The fixed size for blocks in this sequence. /// * `block_size` - The fixed size for blocks in this sequence.
/// * `salt_hash` - The [`SaltHash`] for the sequence. /// * `salt_hash` - The [`SaltHash`] for the sequence.
pub(crate) fn create_sequence_root(block_size: usize, salt_hash: SaltHash) -> Self { pub(crate) fn create_sequence_root(block_size: u32, salt_hash: SaltHash) -> Self {
Self { Self {
tokens: Tokens::default(), tokens: Tokens::default(),
block_size, block_size,
...@@ -223,7 +219,7 @@ impl PartialTokenBlock { ...@@ -223,7 +219,7 @@ impl PartialTokenBlock {
/// * `Ok(())` - If the token was successfully added. /// * `Ok(())` - If the token was successfully added.
/// * `Err(TokenBlockError::Full)` - If the block already contains `block_size` tokens. /// * `Err(TokenBlockError::Full)` - If the block already contains `block_size` tokens.
pub(crate) fn push_token(&mut self, token: Token) -> Result<(), TokenBlockError> { pub(crate) fn push_token(&mut self, token: Token) -> Result<(), TokenBlockError> {
if self.tokens.0.len() >= self.block_size { if self.tokens.0.len() >= self.block_size as usize {
return Err(TokenBlockError::Full); return Err(TokenBlockError::Full);
} }
self.tokens.0.push(token); self.tokens.0.push(token);
...@@ -305,7 +301,7 @@ impl PartialTokenBlock { ...@@ -305,7 +301,7 @@ impl PartialTokenBlock {
/// * `Ok(TokenBlock)` - The newly created full [`TokenBlock`]. /// * `Ok(TokenBlock)` - The newly created full [`TokenBlock`].
/// * `Err(TokenBlockError::Incomplete)` - If the block does not contain exactly `block_size` tokens. /// * `Err(TokenBlockError::Incomplete)` - If the block does not contain exactly `block_size` tokens.
pub(crate) fn commit(&mut self) -> Result<TokenBlock, TokenBlockError> { pub(crate) fn commit(&mut self) -> Result<TokenBlock, TokenBlockError> {
if self.tokens.0.len() != self.block_size { if self.tokens.0.len() != self.block_size as usize {
// Check for exact size match for committing // Check for exact size match for committing
return Err(TokenBlockError::Incomplete); return Err(TokenBlockError::Incomplete);
} }
...@@ -327,7 +323,7 @@ impl PartialTokenBlock { ...@@ -327,7 +323,7 @@ impl PartialTokenBlock {
/// Returns the number of additional tokens required to fill the block. /// Returns the number of additional tokens required to fill the block.
pub fn remaining(&self) -> usize { pub fn remaining(&self) -> usize {
// Use saturating_sub to prevent underflow if len somehow exceeds block_size // Use saturating_sub to prevent underflow if len somehow exceeds block_size
self.block_size.saturating_sub(self.tokens.0.len()) (self.block_size as usize).saturating_sub(self.tokens.0.len())
} }
/// Returns the number of tokens currently in the block. /// Returns the number of tokens currently in the block.
...@@ -408,7 +404,7 @@ impl TokenBlock { ...@@ -408,7 +404,7 @@ impl TokenBlock {
pub fn next_block(&self) -> PartialTokenBlock { pub fn next_block(&self) -> PartialTokenBlock {
PartialTokenBlock { PartialTokenBlock {
tokens: Tokens::default(), tokens: Tokens::default(),
block_size: self.tokens.len(), // Should be == self.block_size block_size: self.tokens.len() as u32, // Should be == self.block_size
salt_hash: self.salt_hash, salt_hash: self.salt_hash,
parent_sequence_hash: Some(self.sequence_hash), // Link to this block parent_sequence_hash: Some(self.sequence_hash), // Link to this block
} }
...@@ -500,7 +496,7 @@ impl TokenBlockSequence { ...@@ -500,7 +496,7 @@ impl TokenBlockSequence {
/// # Panics /// # Panics
/// ///
/// Panics if `block_size` is 0. /// Panics if `block_size` is 0.
pub fn new(tokens: Tokens, block_size: usize, salt_hash: Option<SaltHash>) -> Self { pub fn new(tokens: Tokens, block_size: u32, salt_hash: Option<SaltHash>) -> Self {
assert!(block_size > 0, "block_size must be greater than 0"); assert!(block_size > 0, "block_size must be greater than 0");
let salt_hash = salt_hash.unwrap_or(0); let salt_hash = salt_hash.unwrap_or(0);
let (blocks, current_block) = Self::split_tokens(&tokens, block_size, salt_hash); let (blocks, current_block) = Self::split_tokens(&tokens, block_size, salt_hash);
...@@ -640,7 +636,7 @@ impl TokenBlockSequence { ...@@ -640,7 +636,7 @@ impl TokenBlockSequence {
let tokens_to_pop_from_blocks = n - current_len; let tokens_to_pop_from_blocks = n - current_len;
// Calculate how many blocks are affected (including the one partially popped) // Calculate how many blocks are affected (including the one partially popped)
let num_blocks_to_affect = tokens_to_pop_from_blocks.div_ceil(block_size); let num_blocks_to_affect = tokens_to_pop_from_blocks.div_ceil(block_size as usize);
// Check if we need to pop more blocks than available (should be prevented by initial len check) // Check if we need to pop more blocks than available (should be prevented by initial len check)
if num_blocks_to_affect > self.blocks.len() { if num_blocks_to_affect > self.blocks.len() {
...@@ -657,10 +653,10 @@ impl TokenBlockSequence { ...@@ -657,10 +653,10 @@ impl TokenBlockSequence {
// Calculate how many tokens to keep from that source block // Calculate how many tokens to keep from that source block
let num_full_blocks_completely_popped = num_blocks_to_affect - 1; let num_full_blocks_completely_popped = num_blocks_to_affect - 1;
let num_tokens_to_pop_from_source_block = let num_tokens_to_pop_from_source_block = tokens_to_pop_from_blocks
tokens_to_pop_from_blocks - num_full_blocks_completely_popped * block_size; - num_full_blocks_completely_popped * block_size as usize;
let num_tokens_to_keep_in_new_partial = let num_tokens_to_keep_in_new_partial =
block_size.saturating_sub(num_tokens_to_pop_from_source_block); (block_size as usize).saturating_sub(num_tokens_to_pop_from_source_block);
// Get the tokens for the new partial block // Get the tokens for the new partial block
let new_partial_tokens = if num_tokens_to_keep_in_new_partial > 0 { let new_partial_tokens = if num_tokens_to_keep_in_new_partial > 0 {
...@@ -789,7 +785,7 @@ impl TokenBlockSequence { ...@@ -789,7 +785,7 @@ impl TokenBlockSequence {
/// Returns the total number of tokens in the sequence (sum of tokens in all completed blocks /// Returns the total number of tokens in the sequence (sum of tokens in all completed blocks
/// plus tokens in the current partial block). /// plus tokens in the current partial block).
pub fn total_tokens(&self) -> usize { pub fn total_tokens(&self) -> usize {
let block_size = self.current_block.block_size; let block_size = self.current_block.block_size as usize;
(self.blocks.len() * block_size) + self.current_block.len() (self.blocks.len() * block_size) + self.current_block.len()
} }
...@@ -812,14 +808,14 @@ impl TokenBlockSequence { ...@@ -812,14 +808,14 @@ impl TokenBlockSequence {
/// Panics if `block_size` is 0. /// Panics if `block_size` is 0.
pub fn split_tokens( pub fn split_tokens(
tokens: &[Token], tokens: &[Token],
block_size: usize, block_size: u32,
salt_hash: u64, salt_hash: u64,
) -> (Vec<TokenBlock>, PartialTokenBlock) { ) -> (Vec<TokenBlock>, PartialTokenBlock) {
assert!(block_size > 0, "block_size must be greater than 0"); assert!(block_size > 0, "block_size must be greater than 0");
// Use Rayon for parallel computation of block chunks (hashes) // Use Rayon for parallel computation of block chunks (hashes)
let chunks: Vec<TokenBlockChunk> = tokens let chunks: Vec<TokenBlockChunk> = tokens
.as_ref() .as_ref()
.par_chunks_exact(block_size) .par_chunks_exact(block_size as usize)
.map(|chunk| TokenBlockChunk::from_tokens(chunk, salt_hash)) .map(|chunk| TokenBlockChunk::from_tokens(chunk, salt_hash))
.collect(); .collect();
...@@ -834,7 +830,10 @@ impl TokenBlockSequence { ...@@ -834,7 +830,10 @@ impl TokenBlockSequence {
} }
// Handle any remaining tokens // Handle any remaining tokens
let remainder = tokens.as_ref().chunks_exact(block_size).remainder(); let remainder = tokens
.as_ref()
.chunks_exact(block_size as usize)
.remainder();
let current_block = PartialTokenBlock { let current_block = PartialTokenBlock {
tokens: remainder.into(), tokens: remainder.into(),
...@@ -856,7 +855,7 @@ mod tests { ...@@ -856,7 +855,7 @@ mod tests {
// Helper to create a sequence for testing // Helper to create a sequence for testing
fn create_test_sequence( fn create_test_sequence(
initial_tokens: &[Token], initial_tokens: &[Token],
block_size: usize, block_size: u32,
salt_hash: Option<SaltHash>, salt_hash: Option<SaltHash>,
) -> TokenBlockSequence { ) -> TokenBlockSequence {
TokenBlockSequence::new(Tokens::from(initial_tokens), block_size, salt_hash) TokenBlockSequence::new(Tokens::from(initial_tokens), block_size, salt_hash)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment