Unverified Commit 7d0c9386 authored by Graham King's avatar Graham King Committed by GitHub
Browse files

feat: Initial Granite support (#1271)

- Add Granite to our tokenizer
- Fix pre-processor to load context length correctly
- Add strftime_now Jinja function for prompt templates
- Update llama.cpp
- Handle trtllm errors when not using trtllm

Support depends on the engine:

- `mistral.rs`, our default engine, doesn't support Granite yet.

- `llama.cpp` does and works very well:
```
dynamo-run out=llamacpp ~/llms/granite-3.3-2b-instruct-Q4_K_M.gguf --context-length 16384
```

- `vllm` also works very well:
```
dynamo-run in=http out=vllm ~/llms/granite-3.3-2b-instruct --context-length 16384
```

- `sglang` mostly works, but it doesn't catch the stop token, so we catch it in the HTTP ingress and log an error. The Text ingress doesn't catch it because I disabled that check to make the raw echo engine work. A bit of work to do here.

Closes: #1245 
parent d784877f
...@@ -3435,9 +3435,9 @@ checksum = "23fb14cb19457329c82206317a5663005a4d404783dc74f4252769b0d5f42856" ...@@ -3435,9 +3435,9 @@ checksum = "23fb14cb19457329c82206317a5663005a4d404783dc74f4252769b0d5f42856"
[[package]] [[package]]
name = "llama-cpp-2" name = "llama-cpp-2"
version = "0.1.103" version = "0.1.107"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "401c708926326b1ee410735dc348882c73deeab78f1f89ff2c9caf148356feb4" checksum = "cdf1e72044420c92eb66ec70521cdcfe872b1fe7e7383edd932424d32289105d"
dependencies = [ dependencies = [
"enumflags2", "enumflags2",
"llama-cpp-sys-2", "llama-cpp-sys-2",
......
...@@ -13,6 +13,8 @@ ...@@ -13,6 +13,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import logging
from dynamo._core import AggregatedMetrics as AggregatedMetrics from dynamo._core import AggregatedMetrics as AggregatedMetrics
try: try:
...@@ -45,3 +47,7 @@ try: ...@@ -45,3 +47,7 @@ try:
) )
except ImportError: except ImportError:
pass # TensorRTLLM is not enabled by default pass # TensorRTLLM is not enabled by default
except Exception as e:
# Don't let TensorRTLLM break other engines
logger = logging.getLogger(__name__)
logger.exception(f"Error importing TensorRT-LLM components: {e}")
...@@ -38,4 +38,4 @@ async-stream = { workspace = true } ...@@ -38,4 +38,4 @@ async-stream = { workspace = true }
tokio = { workspace = true } tokio = { workspace = true }
tracing = { workspace = true } tracing = { workspace = true }
llama-cpp-2 = { version = "0.1.103" } llama-cpp-2 = { version = "0.1.107" }
...@@ -58,6 +58,7 @@ pub enum GGUFArchitecture { ...@@ -58,6 +58,7 @@ pub enum GGUFArchitecture {
Qwen2, Qwen2,
Qwen3, Qwen3,
Gemma3, Gemma3,
Granite,
} }
// Wraps from_str() for some convenience: // Wraps from_str() for some convenience:
......
...@@ -115,15 +115,20 @@ impl ModelDeploymentCard { ...@@ -115,15 +115,20 @@ impl ModelDeploymentCard {
} }
async fn from_repo(repo_id: &str, model_name: &str) -> anyhow::Result<Self> { async fn from_repo(repo_id: &str, model_name: &str) -> anyhow::Result<Self> {
// This is usually the right choice
let context_length = file_json_field( let context_length = file_json_field(
&Path::join(&PathBuf::from(repo_id), "tokenizer_config.json"), &Path::join(&PathBuf::from(repo_id), "config.json"),
"model_max_length", "max_position_embeddings",
) )
// But sometimes this is
.or_else(|_| {
file_json_field(
&Path::join(&PathBuf::from(repo_id), "tokenizer_config.json"),
"model_max_length",
)
})
// If neither of those are present let the engine default it
.unwrap_or(0); .unwrap_or(0);
tracing::trace!(
context_length,
"Loaded context length (model_max_length) from tokenizer_config.json"
);
Ok(Self { Ok(Self {
display_name: model_name.to_string(), display_name: model_name.to_string(),
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
use std::sync::Arc; use std::sync::Arc;
use super::tokcfg::{raise_exception, tojson, ChatTemplate}; use super::tokcfg::{raise_exception, strftime_now, tojson, ChatTemplate};
use super::{ContextMixins, HfTokenizerConfigJsonFormatter, JinjaEnvironment}; use super::{ContextMixins, HfTokenizerConfigJsonFormatter, JinjaEnvironment};
use either::Either; use either::Either;
use minijinja::Environment; use minijinja::Environment;
...@@ -50,10 +50,11 @@ impl HfTokenizerConfigJsonFormatter { ...@@ -50,10 +50,11 @@ impl HfTokenizerConfigJsonFormatter {
// todo: should we use this: minijinja_contrib::add_to_environment(&mut env); // todo: should we use this: minijinja_contrib::add_to_environment(&mut env);
env.set_unknown_method_callback(minijinja_contrib::pycompat::unknown_method_callback); env.set_unknown_method_callback(minijinja_contrib::pycompat::unknown_method_callback);
// add custom functions and filters
env.add_function("raise_exception", raise_exception);
env.add_filter("tojson", tojson); env.add_filter("tojson", tojson);
env.add_function("raise_exception", raise_exception);
env.add_function("strftime_now", strftime_now);
let mut supports_add_generation_prompt = None; let mut supports_add_generation_prompt = None;
match &chat_template.0 { match &chat_template.0 {
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
use std::{collections::HashMap, fs::File, path::Path}; use std::{collections::HashMap, fs::File, path::Path};
use chrono::{DateTime, Local};
use either::Either; use either::Either;
use ggus::{GGufMetaKV, GGufReader}; use ggus::{GGufMetaKV, GGufReader};
use memmap2::Mmap; use memmap2::Mmap;
...@@ -225,3 +226,10 @@ pub fn tojson(value: Value, kwargs: Kwargs) -> Result<Value, Error> { ...@@ -225,3 +226,10 @@ pub fn tojson(value: Value, kwargs: Kwargs) -> Result<Value, Error> {
Value::from_safe_string(rv) Value::from_safe_string(rv)
}) })
} }
pub fn strftime_now(format_str: &str) -> Result<Value, Error> {
let local: DateTime<Local> = Local::now();
Ok(Value::from_safe_string(
local.format(format_str).to_string(),
))
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment