feat: Initial Granite support (#1271)

- Add Granite to our tokenizer
- Fix pre-processor to load context length correctly
- Add strftime_now Jinja function for prompt templates
- Update llama.cpp
- Handle trtllm errors when not using trtllm

Support depends on the engine:

- `mistral.rs`, our default engine, doesn't support Granite yet.

- `llama.cpp` does and works very well:
```
dynamo-run out=llamacpp ~/llms/granite-3.3-2b-instruct-Q4_K_M.gguf --context-length 16384
```

- `vllm` also works very well:
```
dynamo-run in=http out=vllm ~/llms/granite-3.3-2b-instruct --context-length 16384
```

- `sglang` mostly works, but it doesn't catch the stop token, so we catch it in the HTTP ingress and log an error. The Text ingress doesn't catch it because I disabled it to make the raw echo engine work. A bit of work to do here.

Closes: #1245

feat: Initial Granite support (#1271)
- Add Granite to our tokenizer
- Fix pre-processor to load context length correctly
- Add strftime_now Jinja function for prompt templates
- Update llama.cpp
- Handle trtllm errors when not using trtllm

Support depends on the engine:

- `mistral.rs`, our default engine, doesn't support Granite yet.

- `llama.cpp` does and works very well:
```
dynamo-run out=llamacpp ~/llms/granite-3.3-2b-instruct-Q4_K_M.gguf --context-length 16384
```

- `vllm` also works very well:
```
dynamo-run in=http out=vllm ~/llms/granite-3.3-2b-instruct --context-length 16384
```

- `sglang` mostly works, but it doesn't catch the stop token, so we catch it in the HTTP ingress and log an error. The Text ingress doesn't catch it because I disabled it to make the raw echo engine work. A bit of work to do here.

Closes: #1245
7d0c9386 · Graham King · GitHub · d784877f · 7d0c9386 · 7d0c9386
Unverified Commit 7d0c9386 authored May 29, 2025 by Graham King Committed by GitHub May 29, 2025
7 changed files
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3435,9 +3435,9 @@ checksum = "23fb14cb19457329c82206317a5663005a4d404783dc74f4252769b0d5f42856"
 [[package]]
 name = "llama-cpp-2"
-version = "0.1.103"
+version = "0.1.107"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "401c708926326b1ee410735dc348882c73deeab78f1f89ff2c9caf148356feb4"
+checksum = "cdf1e72044420c92eb66ec70521cdcfe872b1fe7e7383edd932424d32289105d"
 dependencies = [
 "enumflags2",
 "llama-cpp-sys-2",

--- a/lib/bindings/python/src/dynamo/llm/__init__.py
+++ b/lib/bindings/python/src/dynamo/llm/__init__.py
@@ -13,6 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import logging
 from dynamo._core import AggregatedMetrics as AggregatedMetrics
 try:
@@ -45,3 +47,7 @@ try:
    )
 except ImportError:
    pass  # TensorRTLLM is not enabled by default
+except Exception as e:
+    # Don't let TensorRTLLM break other engines
+    logger = logging.getLogger(__name__)
+    logger.exception(f"Error importing TensorRT-LLM components: {e}")
--- a/lib/engines/llamacpp/Cargo.toml
+++ b/lib/engines/llamacpp/Cargo.toml
@@ -38,4 +38,4 @@ async-stream = { workspace = true }
 tokio = { workspace = true }
 tracing = { workspace = true }
-llama-cpp-2 = { version = "0.1.103" }
+llama-cpp-2 = { version = "0.1.107" }
--- a/lib/llm/src/gguf.rs
+++ b/lib/llm/src/gguf.rs
@@ -58,6 +58,7 @@ pub enum GGUFArchitecture {
    Qwen2,
    Qwen3,
    Gemma3,
+    Granite,
 }
 // Wraps from_str() for some convenience:

--- a/lib/llm/src/model_card/create.rs
+++ b/lib/llm/src/model_card/create.rs
@@ -115,15 +115,20 @@ impl ModelDeploymentCard {
    }
    async fn from_repo(repo_id: &str, model_name: &str) -> anyhow::Result<Self> {
+        // This is usually the right choice
        let context_length = file_json_field(
-            &Path::join(&PathBuf::from(repo_id), "tokenizer_config.json"),
+            &Path::join(&PathBuf::from(repo_id), "config.json"),
-            "model_max_length",
+            "max_position_embeddings",
        )
+        // But sometimes this is
+        .or_else(|_| {
+            file_json_field(
+                &Path::join(&PathBuf::from(repo_id), "tokenizer_config.json"),
+                "model_max_length",
+            )
+        })
+        // If neither of those are present let the engine default it
        .unwrap_or(0);
-        tracing::trace!(
-            context_length,
-            "Loaded context length (model_max_length) from tokenizer_config.json"
-        );
        Ok(Self {
            display_name: model_name.to_string(),

--- a/lib/llm/src/preprocessor/prompt/template/formatters.rs
+++ b/lib/llm/src/preprocessor/prompt/template/formatters.rs
@@ -15,7 +15,7 @@
 use std::sync::Arc;
-use super::tokcfg::{raise_exception, tojson, ChatTemplate};
+use super::tokcfg::{raise_exception, strftime_now, tojson, ChatTemplate};
 use super::{ContextMixins, HfTokenizerConfigJsonFormatter, JinjaEnvironment};
 use either::Either;
 use minijinja::Environment;
@@ -50,10 +50,11 @@ impl HfTokenizerConfigJsonFormatter {
        // todo: should we use this: minijinja_contrib::add_to_environment(&mut env);
        env.set_unknown_method_callback(minijinja_contrib::pycompat::unknown_method_callback);
-        // add custom functions and filters
-        env.add_function("raise_exception", raise_exception);
        env.add_filter("tojson", tojson);
+        env.add_function("raise_exception", raise_exception);
+        env.add_function("strftime_now", strftime_now);
        let mut supports_add_generation_prompt = None;
        match &chat_template.0 {

--- a/lib/llm/src/preprocessor/prompt/template/tokcfg.rs
+++ b/lib/llm/src/preprocessor/prompt/template/tokcfg.rs
@@ -17,6 +17,7 @@
 use std::{collections::HashMap, fs::File, path::Path};
+use chrono::{DateTime, Local};
 use either::Either;
 use ggus::{GGufMetaKV, GGufReader};
 use memmap2::Mmap;
@@ -225,3 +226,10 @@ pub fn tojson(value: Value, kwargs: Kwargs) -> Result<Value, Error> {
        Value::from_safe_string(rv)
    })
 }
+/// Jinja function `strftime_now(format)`: renders the current local time
+/// using a strftime-style format string and returns it as a safe string.
+///
+/// # Errors
+///
+/// Returns a template error when `format_str` contains an invalid specifier.
+/// chrono only reports a bad format string at the moment the value is
+/// formatted, and `to_string()` would turn that report into a panic, so the
+/// value is formatted with `write!` and the failure is surfaced to the
+/// template engine instead of crashing the process.
+pub fn strftime_now(format_str: &str) -> Result<Value, Error> {
+    use std::fmt::Write as _;
+
+    let local: DateTime<Local> = Local::now();
+    let mut rendered = String::new();
+    write!(rendered, "{}", local.format(format_str)).map_err(|_| {
+        Error::new(
+            minijinja::ErrorKind::InvalidOperation,
+            format!("strftime_now: invalid format string '{format_str}'"),
+        )
+    })?;
+    Ok(Value::from_safe_string(rendered))
+}