Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
65a2dfab
Commit
65a2dfab
authored
Feb 20, 2025
by
Biswa Panda
Committed by
GitHub
Feb 20, 2025
Browse files
feat: add local model card (#216)
parent
7f85dcc3
Changes
20
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
951 additions
and
9 deletions
+951
-9
.github/workflows/copyright-check.ps1
.github/workflows/copyright-check.ps1
+1
-1
applications/llm/bin/tio/Cargo.lock
applications/llm/bin/tio/Cargo.lock
+2
-0
examples/rust/Cargo.lock
examples/rust/Cargo.lock
+3
-1
llm/rust/Cargo.lock
llm/rust/Cargo.lock
+5
-2
llm/rust/triton-llm/Cargo.toml
llm/rust/triton-llm/Cargo.toml
+11
-2
llm/rust/triton-llm/src/common.rs
llm/rust/triton-llm/src/common.rs
+16
-0
llm/rust/triton-llm/src/common/versioned.rs
llm/rust/triton-llm/src/common/versioned.rs
+21
-0
llm/rust/triton-llm/src/lib.rs
llm/rust/triton-llm/src/lib.rs
+2
-0
llm/rust/triton-llm/src/model_card.rs
llm/rust/triton-llm/src/model_card.rs
+17
-0
llm/rust/triton-llm/src/model_card/create.rs
llm/rust/triton-llm/src/model_card/create.rs
+172
-0
llm/rust/triton-llm/src/model_card/model.rs
llm/rust/triton-llm/src/model_card/model.rs
+288
-0
llm/rust/triton-llm/tests/data/sample-models/mock-llama-3.1-8b-instruct/config.json
...data/sample-models/mock-llama-3.1-8b-instruct/config.json
+27
-0
llm/rust/triton-llm/tests/data/sample-models/mock-llama-3.1-8b-instruct/generation_config.json
...-models/mock-llama-3.1-8b-instruct/generation_config.json
+9
-0
llm/rust/triton-llm/tests/data/sample-models/mock-llama-3.1-8b-instruct/tokenizer.json
...a/sample-models/mock-llama-3.1-8b-instruct/tokenizer.json
+209
-0
llm/rust/triton-llm/tests/data/sample-models/mock-llama-3.1-8b-instruct/tokenizer_config.json
...e-models/mock-llama-3.1-8b-instruct/tokenizer_config.json
+94
-0
llm/rust/triton-llm/tests/model_card.rs
llm/rust/triton-llm/tests/model_card.rs
+70
-0
python-wheel/Cargo.lock
python-wheel/Cargo.lock
+2
-0
runtime/rust/src/lib.rs
runtime/rust/src/lib.rs
+1
-1
runtime/rust/src/slug.rs
runtime/rust/src/slug.rs
+0
-0
runtime/rust/src/transports/nats.rs
runtime/rust/src/transports/nats.rs
+1
-2
No files found.
.github/workflows/copyright-check.ps1
View file @
65a2dfab
...
@@ -122,7 +122,7 @@ $global:copyright_results = @{
...
@@ -122,7 +122,7 @@ $global:copyright_results = @{
$ignored_files
= @('.clang-format', '.gitattributes', '.gitignore', '.gitkeep', '.patch', 'Cargo.lock', 'LICENSE', 'uv.lock', 'rust-toolchain.toml')
$ignored_files
= @('.clang-format', '.gitattributes', '.gitignore', '.gitkeep', '.patch', 'Cargo.lock', 'LICENSE', 'uv.lock', 'rust-toolchain.toml')
write-debug "
<
copyright-check
>
ignored_files
=
[
'$($ignored_files -join "'
,
'")'
]
.
"
write-debug "
<
copyright-check
>
ignored_files
=
[
'$($ignored_files -join "'
,
'")'
]
.
"
$ignored_paths
= @('.github', '.mypy_cache', '.pytest_cache')
$ignored_paths
= @('.github', '.mypy_cache', '.pytest_cache'
, 'llm/rust/triton-llm/tests/data/sample-models'
)
write-debug "
<
copyright-check
>
ignored_paths
=
[
'$($ignored_paths -join "'
,
'")'
]
.
"
write-debug "
<
copyright-check
>
ignored_paths
=
[
'$($ignored_paths -join "'
,
'")'
]
.
"
$ignored_types
= @('.bat', '.gif', '.ico', '.ipynb', '.jpg', '.jpeg', '.patch', '.png', '.pyc', '.pyi', '.rst', '.zip', '.md')
$ignored_types
= @('.bat', '.gif', '.ico', '.ipynb', '.jpg', '.jpeg', '.patch', '.png', '.pyc', '.pyi', '.rst', '.zip', '.md')
write-debug "
<
copyright-check
>
ignored_types
=
[
'$($ignored_types -join "'
,
'")'
]
.
"
write-debug "
<
copyright-check
>
ignored_types
=
[
'$($ignored_types -join "'
,
'")'
]
.
"
...
...
applications/llm/bin/tio/Cargo.lock
View file @
65a2dfab
...
@@ -687,6 +687,7 @@ dependencies = [
...
@@ -687,6 +687,7 @@ dependencies = [
"iana-time-zone",
"iana-time-zone",
"js-sys",
"js-sys",
"num-traits",
"num-traits",
"serde",
"wasm-bindgen",
"wasm-bindgen",
"windows-targets 0.52.6",
"windows-targets 0.52.6",
]
]
...
@@ -5125,6 +5126,7 @@ dependencies = [
...
@@ -5125,6 +5126,7 @@ dependencies = [
"async-stream",
"async-stream",
"async-trait",
"async-trait",
"axum 0.8.1",
"axum 0.8.1",
"blake3",
"bytes",
"bytes",
"chrono",
"chrono",
"derive_builder",
"derive_builder",
...
...
examples/rust/Cargo.lock
View file @
65a2dfab
# This file is automatically @generated by Cargo.
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
# It is not intended for manual editing.
version =
3
version =
4
[[package]]
[[package]]
name = "addr2line"
name = "addr2line"
...
@@ -464,6 +464,7 @@ dependencies = [
...
@@ -464,6 +464,7 @@ dependencies = [
"iana-time-zone",
"iana-time-zone",
"js-sys",
"js-sys",
"num-traits",
"num-traits",
"serde",
"wasm-bindgen",
"wasm-bindgen",
"windows-targets",
"windows-targets",
]
]
...
@@ -3104,6 +3105,7 @@ dependencies = [
...
@@ -3104,6 +3105,7 @@ dependencies = [
"async-stream",
"async-stream",
"async-trait",
"async-trait",
"axum 0.8.1",
"axum 0.8.1",
"blake3",
"bytes",
"bytes",
"chrono",
"chrono",
"derive_builder",
"derive_builder",
...
...
llm/rust/Cargo.lock
View file @
65a2dfab
...
@@ -712,6 +712,7 @@ dependencies = [
...
@@ -712,6 +712,7 @@ dependencies = [
"iana-time-zone",
"iana-time-zone",
"js-sys",
"js-sys",
"num-traits",
"num-traits",
"serde",
"wasm-bindgen",
"wasm-bindgen",
"windows-targets 0.52.6",
"windows-targets 0.52.6",
]
]
...
@@ -4713,9 +4714,9 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
...
@@ -4713,9 +4714,9 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
[[package]]
[[package]]
name = "tempfile"
name = "tempfile"
version = "3.1
6.0
"
version = "3.1
7.1
"
source = "registry+https://github.com/rust-lang/crates.io-index"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "
38c246215d7d24f48ae091a2902398798e05d978b24315d6efbc00ede9a8bb91
"
checksum = "
22e5a0acb1f3f55f65cc4a866c361b2fb2a0ff6366785ae6fbb5f85df07ba230
"
dependencies = [
dependencies = [
"cfg-if 1.0.0",
"cfg-if 1.0.0",
"fastrand",
"fastrand",
...
@@ -5283,6 +5284,7 @@ dependencies = [
...
@@ -5283,6 +5284,7 @@ dependencies = [
"async-stream",
"async-stream",
"async-trait",
"async-trait",
"axum 0.8.1",
"axum 0.8.1",
"blake3",
"bytes",
"bytes",
"chrono",
"chrono",
"derive_builder",
"derive_builder",
...
@@ -5298,6 +5300,7 @@ dependencies = [
...
@@ -5298,6 +5300,7 @@ dependencies = [
"rstest",
"rstest",
"serde",
"serde",
"serde_json",
"serde_json",
"tempfile",
"thiserror 2.0.11",
"thiserror 2.0.11",
"tokio",
"tokio",
"tokio-stream",
"tokio-stream",
...
...
llm/rust/triton-llm/Cargo.toml
View file @
65a2dfab
...
@@ -48,8 +48,16 @@ validator = { workspace = true }
...
@@ -48,8 +48,16 @@ validator = { workspace = true }
uuid
=
{
workspace
=
true
}
uuid
=
{
workspace
=
true
}
xxhash-rust
=
{
workspace
=
true
}
xxhash-rust
=
{
workspace
=
true
}
blake3
=
"1"
# protocols
# protocols
chrono
=
{
version
=
"0.4"
}
chrono
=
{
version
=
"0.4"
,
default-features
=
false
,
features
=
[
"alloc"
,
"std"
,
"clock"
,
"now"
,
"serde"
,
]
}
serde_json
=
{
version
=
"1"
}
serde_json
=
{
version
=
"1"
}
regex
=
"1"
regex
=
"1"
unicode-segmentation
=
"1.12"
unicode-segmentation
=
"1.12"
...
@@ -67,4 +75,5 @@ mistralrs = { git = "https://github.com/EricLBuehler/mistral.rs.git", rev = "5e6
...
@@ -67,4 +75,5 @@ mistralrs = { git = "https://github.com/EricLBuehler/mistral.rs.git", rev = "5e6
insta
=
{
version
=
"1.41"
,
features
=
[
"glob"
,
"json"
,
"redactions"
]}
insta
=
{
version
=
"1.41"
,
features
=
[
"glob"
,
"json"
,
"redactions"
]}
proptest
=
"1.5.0"
proptest
=
"1.5.0"
reqwest
=
{
version
=
"0.12"
,
default-features
=
false
,
features
=
[
"json"
,
"stream"
,
"rustls-tls"
]
}
reqwest
=
{
version
=
"0.12"
,
default-features
=
false
,
features
=
[
"json"
,
"stream"
,
"rustls-tls"
]
}
rstest
=
"0.18.2"
rstest
=
"0.18.2"
\ No newline at end of file
tempfile
=
"3.17.1"
\ No newline at end of file
llm/rust/triton-llm/src/common.rs
0 → 100644
View file @
65a2dfab
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub
mod
versioned
;
llm/rust/triton-llm/src/common/versioned.rs
0 → 100644
View file @
65a2dfab
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/// A trait allowing to get/set a revision on an object.
/// NATS uses this to ensure atomic updates.
pub
trait
Versioned
{
fn
revision
(
&
self
)
->
u64
;
fn
set_revision
(
&
mut
self
,
r
:
u64
);
}
llm/rust/triton-llm/src/lib.rs
View file @
65a2dfab
...
@@ -23,3 +23,5 @@ pub mod http;
...
@@ -23,3 +23,5 @@ pub mod http;
pub
mod
kv_router
;
pub
mod
kv_router
;
pub
mod
protocols
;
pub
mod
protocols
;
pub
mod
types
;
pub
mod
types
;
pub
mod
model_card
;
pub
mod
common
;
\ No newline at end of file
llm/rust/triton-llm/src/model_card.rs
0 → 100644
View file @
65a2dfab
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub
mod
create
;
pub
mod
model
;
\ No newline at end of file
llm/rust/triton-llm/src/model_card/create.rs
0 → 100644
View file @
65a2dfab
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use
std
::
collections
::
HashMap
;
use
std
::
path
::
Path
;
use
std
::
fs
;
use
crate
::
model_card
::
model
::
ModelDeploymentCard
;
use
anyhow
::{
Context
,
Result
};
use
crate
::
model_card
::
model
::{
ModelInfoType
,
TokenizerKind
,
PromptFormatterArtifact
,
File
};
impl
ModelDeploymentCard
{
/// Creates a ModelDeploymentCard from a local directory path.
///
/// Currently HuggingFace format is supported and following files are expected:
/// - config.json: Model configuration in HuggingFace format
/// - tokenizer.json: Tokenizer configuration in HuggingFace format
/// - tokenizer_config.json: Optional prompt formatter configuration
///
/// # Arguments
/// * `local_root_dir` - Path to the local model directory
///
/// # Errors
/// Returns an error if:
/// - The path doesn't exist or isn't a directory
/// - The path contains invalid Unicode characters
/// - Required model files are missing or invalid
pub
async
fn
from_local_path
(
local_root_dir
:
impl
AsRef
<
Path
>
)
->
anyhow
::
Result
<
Self
>
{
let
local_root_dir
=
local_root_dir
.as_ref
();
check_valid_local_repo_path
(
local_root_dir
)
?
;
let
repo_id
=
local_root_dir
.canonicalize
()
?
.to_str
()
.ok_or_else
(||
anyhow
::
anyhow!
(
"Path contains invalid Unicode"
))
?
.to_string
();
let
model_name
=
local_root_dir
.file_name
()
.and_then
(|
n
|
n
.to_str
())
.ok_or_else
(||
anyhow
::
anyhow!
(
"Invalid model directory name"
))
?
;
Self
::
from_repo
(
&
repo_id
,
model_name
)
.await
}
/// TODO: This will be implemented after nova-hub is integrated with the model-card
/// TODO: Attempt to auto-detect model type and construct an MDC from a NGC repo
pub
async
fn
from_ngc_repo
(
_
:
&
str
)
->
anyhow
::
Result
<
Self
>
{
Err
(
anyhow
::
anyhow!
(
"ModelDeploymentCard::from_ngc_repo is not implemented"
))
}
pub
async
fn
from_repo
(
repo_id
:
&
str
,
model_name
:
&
str
)
->
anyhow
::
Result
<
Self
>
{
Ok
(
Self
{
display_name
:
model_name
.to_string
(),
service_name
:
model_name
.to_string
(),
model_info
:
ModelInfoType
::
from_repo
(
repo_id
)
.await
?
,
tokenizer
:
TokenizerKind
::
from_repo
(
repo_id
)
.await
?
,
prompt_formatter
:
PromptFormatterArtifact
::
from_repo
(
repo_id
)
.await
?
,
prompt_context
:
None
,
// TODO - auto-detect prompt context
revision
:
0
,
last_published
:
None
,
requires_preprocessing
:
true
,
})
}
}
impl
ModelInfoType
{
pub
async
fn
from_repo
(
repo_id
:
&
str
)
->
Result
<
Self
>
{
Self
::
try_is_hf_repo
(
repo_id
)
.await
.with_context
(||
format!
(
"unable to extract model info from repo {}"
,
repo_id
))
}
async
fn
try_is_hf_repo
(
repo
:
&
str
)
->
anyhow
::
Result
<
Self
>
{
Ok
(
Self
::
HfConfigJson
(
check_for_file
(
repo
,
"config.json"
)
.await
?
,
))
}
}
impl
PromptFormatterArtifact
{
pub
async
fn
from_repo
(
repo_id
:
&
str
)
->
Result
<
Option
<
Self
>>
{
// we should only error if we expect a prompt formatter and it's not found
// right now, we don't know when to expect it, so we just return Ok(Some/None)
Ok
(
Self
::
try_is_hf_repo
(
repo_id
)
.await
.with_context
(||
format!
(
"unable to extract prompt format from repo {}"
,
repo_id
))
.ok
())
}
async
fn
try_is_hf_repo
(
repo
:
&
str
)
->
anyhow
::
Result
<
Self
>
{
Ok
(
Self
::
HfTokenizerConfigJson
(
check_for_file
(
repo
,
"tokenizer_config.json"
)
.await
?
,
))
}
}
impl
TokenizerKind
{
pub
async
fn
from_repo
(
repo_id
:
&
str
)
->
Result
<
Self
>
{
Self
::
try_is_hf_repo
(
repo_id
)
.await
.with_context
(||
format!
(
"unable to extract tokenizer kind from repo {}"
,
repo_id
))
}
async
fn
try_is_hf_repo
(
repo
:
&
str
)
->
anyhow
::
Result
<
Self
>
{
Ok
(
Self
::
HfTokenizerJson
(
check_for_file
(
repo
,
"tokenizer.json"
)
.await
?
,
))
}
}
/// Checks if the provided path contains the expected file.
async
fn
check_for_file
(
repo_id
:
&
str
,
file
:
&
str
)
->
anyhow
::
Result
<
File
>
{
let
mut
files
=
check_for_files
(
repo_id
,
vec!
[
file
.to_string
()])
.await
?
;
let
file
=
files
.remove
(
file
)
.ok_or
(
anyhow
::
anyhow!
(
"file {} not found"
,
file
))
?
;
Ok
(
file
)
}
async
fn
check_for_files
(
repo_id
:
&
str
,
files
:
Vec
<
String
>
)
->
Result
<
HashMap
<
String
,
File
>>
{
let
dir_entries
=
fs
::
read_dir
(
repo_id
)
.with_context
(||
format!
(
"Failed to read directory: {}"
,
repo_id
))
?
;
let
mut
found_files
=
HashMap
::
new
();
for
entry
in
dir_entries
{
let
entry
=
entry
.with_context
(||
format!
(
"Failed to read directory entry in {}"
,
repo_id
))
?
;
let
path
=
entry
.path
();
let
file_name
=
path
.file_name
()
.and_then
(|
n
|
n
.to_str
())
.ok_or_else
(||
anyhow
::
anyhow!
(
"Invalid file name in {}"
,
repo_id
))
?
;
if
files
.contains
(
&
file_name
.to_string
())
{
found_files
.insert
(
file_name
.to_string
(),
path
.to_str
()
.ok_or_else
(||
anyhow
::
anyhow!
(
"Invalid path"
))
?
.to_string
(),
);
}
}
Ok
(
found_files
)
}
/// Checks if the provided path is a valid local repository path.
///
/// # Arguments
/// * `path` - Path to validate
///
/// # Errors
/// Returns an error if the path doesn't exist or isn't a directory
fn
check_valid_local_repo_path
(
path
:
impl
AsRef
<
Path
>
)
->
Result
<
()
>
{
let
path
=
path
.as_ref
();
if
!
path
.exists
()
{
return
Err
(
anyhow
::
anyhow!
(
"Model path does not exist: {}"
,
path
.display
()));
}
if
!
path
.is_dir
()
{
return
Err
(
anyhow
::
anyhow!
(
"Model path is not a directory: {}"
,
path
.display
()));
}
Ok
(())
}
\ No newline at end of file
llm/rust/triton-llm/src/model_card/model.rs
0 → 100644
View file @
65a2dfab
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! # Model Deployment Card
//!
//! The ModelDeploymentCard (MDC) is the primary model configuration structure that will be available to any
//! component that needs to interact with the model or its dependent artifacts.
//!
//! The ModelDeploymentCard contains LLM model deployment configuration information:
//! - Display name and service name for the model
//! - Model information (ModelInfoType)
//! - Tokenizer configuration (TokenizerKind)
//! - Prompt formatter settings (PromptFormatterArtifact)
//! - Various metadata like revision, publish time, etc.
use
anyhow
::
Result
;
use
either
::
Either
;
use
crate
::
protocols
::
TokenIdType
;
use
serde
::{
Deserialize
,
Serialize
};
use
std
::
sync
::
Arc
;
use
std
::
fmt
;
use
std
::
path
::
Path
;
use
std
::
time
::
Duration
;
use
derive_builder
::
Builder
;
use
triton_distributed
::
slug
::
Slug
;
pub
const
BUCKET_NAME
:
&
str
=
"mdc"
;
/// Delete model deployment cards that haven't been re-published after this long.
/// Cleans up if the worker stopped.
pub
const
BUCKET_TTL
:
Duration
=
Duration
::
from_secs
(
5
*
60
);
/// If a model deployment card hasn't been refreshed in this much time the worker is likely gone
const
CARD_MAX_AGE
:
chrono
::
TimeDelta
=
chrono
::
TimeDelta
::
minutes
(
5
);
pub
type
File
=
String
;
#[derive(Serialize,
Deserialize,
Clone,
Debug)]
#[serde(rename_all
=
"snake_case"
)]
pub
enum
ModelInfoType
{
HfConfigJson
(
File
),
}
#[derive(Serialize,
Deserialize,
Clone,
Debug)]
#[serde(rename_all
=
"snake_case"
)]
pub
enum
TokenizerKind
{
HfTokenizerJson
(
File
),
}
/// Supported types of prompt formatters.
///
/// We need a way to associate the prompt formatter template definition with an associated
/// data model which is expected for rendering.
///
/// All current prompt formatters are Jinja2 templates which use the OpenAI ChatCompletionRequest
/// format. However, we currently do not have a discovery path to know if the model supports tool use
/// unless we inspect the template.
///
/// TODO(): Add an enum for the PromptFormatDataModel with at minimum arms for:
/// - OaiChat
/// - OaiChatToolUse
#[derive(Serialize,
Deserialize,
Clone,
Debug)]
#[serde(rename_all
=
"snake_case"
)]
pub
enum
PromptFormatterArtifact
{
HfTokenizerConfigJson
(
File
),
}
#[derive(Serialize,
Deserialize,
Clone,
Debug,
PartialEq,
Eq,
Hash)]
#[serde(rename_all
=
"snake_case"
)]
pub
enum
PromptContextMixin
{
/// Support OAI Chat Messages and Tools
OaiChat
,
/// Enables templates with `{{datatime}}` to be rendered with the current date and time.
Llama3DateTime
,
}
#[derive(Serialize,
Deserialize,
Clone,
Debug,
Builder)]
pub
struct
ModelDeploymentCard
{
/// Human readable model name, e.g. "Meta Llama 3.1 8B Instruct"
pub
display_name
:
String
,
/// Identifier to expect in OpenAI compatible HTTP request, e.g. "meta-llama/Meta-Llama-3.1-8B-Instruct"
/// This will get slugified for use in NATS.
pub
service_name
:
String
,
/// Model information
pub
model_info
:
ModelInfoType
,
/// Tokenizer configuration
pub
tokenizer
:
TokenizerKind
,
/// Prompt Formatter configuration
#[serde(default,
skip_serializing_if
=
"Option::is_none"
)]
pub
prompt_formatter
:
Option
<
PromptFormatterArtifact
>
,
/// Prompt Formatter Config
#[serde(default,
skip_serializing_if
=
"Option::is_none"
)]
pub
prompt_context
:
Option
<
Vec
<
PromptContextMixin
>>
,
/// When this card was last advertised by a worker. None if not yet published.
pub
last_published
:
Option
<
chrono
::
DateTime
<
chrono
::
Utc
>>
,
/// Incrementing count of how many times we published this card
#[serde(default,
skip_serializing)]
pub
revision
:
u64
,
/// Does this model expect preprocessing (tokenization, etc) to be already done?
/// If this is true they get a BackendInput JSON. If this is false they get
/// a ChatCompletionRequest JSON.
#[serde(default)]
pub
requires_preprocessing
:
bool
,
}
impl
ModelDeploymentCard
{
pub
fn
builder
()
->
ModelDeploymentCardBuilder
{
ModelDeploymentCardBuilder
::
default
()
}
/// A URL and NATS friendly and very likely unique ID for this model.
/// Mostly human readable. a-z, 0-9, _ and - only.
/// Pass the service_name.
pub
fn
service_name_slug
(
s
:
&
str
)
->
Slug
{
Slug
::
from_string
(
s
)
}
pub
fn
set_service_name
(
&
mut
self
,
service_name
:
&
str
)
{
self
.service_name
=
service_name
.to_string
();
}
/// How often we should check if a model deployment card expired because it's workers are gone
pub
fn
expiry_check_period
()
->
Duration
{
match
CARD_MAX_AGE
.to_std
()
{
Ok
(
duration
)
=>
duration
/
3
,
Err
(
_
)
=>
{
// Only happens if CARD_MAX_AGE is negative, which it isn't
unreachable!
(
"Cannot run card expiry watcher, invalid CARD_MAX_AGE"
);
}
}
}
pub
fn
slug
(
&
self
)
->
Slug
{
ModelDeploymentCard
::
service_name_slug
(
&
self
.service_name
)
}
/// Load a model deployment card from a JSON file
pub
fn
load_from_json_file
<
P
:
AsRef
<
Path
>>
(
file
:
P
)
->
std
::
io
::
Result
<
Self
>
{
let
mut
card
:
ModelDeploymentCard
=
serde_json
::
from_str
(
&
std
::
fs
::
read_to_string
(
file
)
?
)
?
;
card
.requires_preprocessing
=
false
;
Ok
(
card
)
}
/// Load a model deployment card from a JSON string
pub
fn
load_from_json_str
(
json
:
&
str
)
->
Result
<
Self
,
anyhow
::
Error
>
{
Ok
(
serde_json
::
from_str
(
json
)
?
)
}
/// Save the model deployment card to a JSON file
pub
fn
save_to_json_file
(
&
self
,
file
:
&
str
)
->
Result
<
(),
anyhow
::
Error
>
{
std
::
fs
::
write
(
file
,
self
.to_json
()
?
)
?
;
Ok
(())
}
/// Serialize the model deployment card to a JSON string
pub
fn
to_json
(
&
self
)
->
Result
<
String
,
anyhow
::
Error
>
{
Ok
(
serde_json
::
to_string
(
self
)
?
)
}
pub
fn
mdcsum
(
&
self
)
->
String
{
let
json
=
self
.to_json
()
.unwrap
();
format!
(
"{}"
,
blake3
::
hash
(
json
.as_bytes
()))
}
/// Was this card last published a long time ago, suggesting the worker is gone?
pub
fn
is_expired
(
&
self
)
->
bool
{
if
let
Some
(
last_published
)
=
self
.last_published
.as_ref
()
{
chrono
::
Utc
::
now
()
-
last_published
>
CARD_MAX_AGE
}
else
{
false
}
}
}
impl
fmt
::
Display
for
ModelDeploymentCard
{
fn
fmt
(
&
self
,
f
:
&
mut
fmt
::
Formatter
<
'_
>
)
->
fmt
::
Result
{
write!
(
f
,
"{}"
,
self
.slug
())
}
}
pub
trait
ModelInfo
:
Send
+
Sync
{
/// Model type
fn
model_type
(
&
self
)
->
String
;
/// Token ID for the beginning of sequence
fn
bos_token_id
(
&
self
)
->
TokenIdType
;
/// Token ID for the end of sequence
fn
eos_token_ids
(
&
self
)
->
Vec
<
TokenIdType
>
;
/// Maximum position embeddings / max sequence length
fn
max_position_embeddings
(
&
self
)
->
usize
;
/// Vocabulary size
fn
vocab_size
(
&
self
)
->
usize
;
}
impl
ModelInfoType
{
pub
async
fn
get_model_info
(
&
self
)
->
Result
<
Arc
<
dyn
ModelInfo
>>
{
match
self
{
Self
::
HfConfigJson
(
info
)
=>
HFConfigJsonFile
::
from_file
(
info
)
.await
,
}
}
}
#[derive(Debug,
Clone,
Serialize,
Deserialize)]
struct
HFConfigJsonFile
{
bos_token_id
:
TokenIdType
,
#[serde(with
=
"either::serde_untagged"
)]
eos_token_id
:
Either
<
TokenIdType
,
Vec
<
TokenIdType
>>
,
/// denotes the mixin to the flattened data model which can be present
/// in the config.json file
architectures
:
Vec
<
String
>
,
/// general model type
model_type
:
String
,
/// max sequence length
max_position_embeddings
:
usize
,
/// number of layers in the model
num_hidden_layers
:
usize
,
/// number of attention heads in the model
num_attention_heads
:
usize
,
/// Vocabulary size
vocab_size
:
usize
,
}
impl
HFConfigJsonFile
{
async
fn
from_file
(
file
:
&
File
)
->
Result
<
Arc
<
dyn
ModelInfo
>>
{
let
contents
=
std
::
fs
::
read_to_string
(
&
file
)
?
;
let
config
:
Self
=
serde_json
::
from_str
(
&
contents
)
?
;
Ok
(
Arc
::
new
(
config
))
}
}
impl
ModelInfo
for
HFConfigJsonFile
{
fn
model_type
(
&
self
)
->
String
{
self
.model_type
.clone
()
}
fn
bos_token_id
(
&
self
)
->
TokenIdType
{
self
.bos_token_id
}
fn
eos_token_ids
(
&
self
)
->
Vec
<
TokenIdType
>
{
match
&
self
.eos_token_id
{
Either
::
Left
(
eos_token_id
)
=>
vec!
[
*
eos_token_id
],
Either
::
Right
(
eos_token_ids
)
=>
eos_token_ids
.clone
(),
}
}
fn
max_position_embeddings
(
&
self
)
->
usize
{
self
.max_position_embeddings
}
fn
vocab_size
(
&
self
)
->
usize
{
self
.vocab_size
}
}
llm/rust/triton-llm/tests/data/sample-models/mock-llama-3.1-8b-instruct/config.json
0 → 100644
View file @
65a2dfab
{
"architectures"
:
[
"LlamaForCausalLM"
],
"attention_bias"
:
false
,
"attention_dropout"
:
0.0
,
"bos_token_id"
:
128000
,
"eos_token_id"
:
128009
,
"hidden_act"
:
"silu"
,
"hidden_size"
:
4096
,
"initializer_range"
:
0.02
,
"intermediate_size"
:
14336
,
"max_position_embeddings"
:
8192
,
"model_type"
:
"llama"
,
"num_attention_heads"
:
32
,
"num_hidden_layers"
:
32
,
"num_key_value_heads"
:
8
,
"pretraining_tp"
:
1
,
"rms_norm_eps"
:
1e-05
,
"rope_scaling"
:
null
,
"rope_theta"
:
500000.0
,
"tie_word_embeddings"
:
false
,
"torch_dtype"
:
"bfloat16"
,
"transformers_version"
:
"4.40.0.dev0"
,
"use_cache"
:
true
,
"vocab_size"
:
128256
}
llm/rust/triton-llm/tests/data/sample-models/mock-llama-3.1-8b-instruct/generation_config.json
0 → 100644
View file @
65a2dfab
{
"bos_token_id"
:
128000
,
"eos_token_id"
:
[
128001
,
128009
],
"do_sample"
:
true
,
"temperature"
:
0.6
,
"max_length"
:
4096
,
"top_p"
:
0.9
,
"transformers_version"
:
"4.40.0.dev0"
}
llm/rust/triton-llm/tests/data/sample-models/mock-llama-3.1-8b-instruct/tokenizer.json
0 → 100644
View file @
65a2dfab
{
"version"
:
"1.0"
,
"truncation"
:
null
,
"padding"
:
null
,
"added_tokens"
:
[
{
"id"
:
128000
,
"content"
:
"<|begin_of_text|>"
,
"single_word"
:
false
,
"lstrip"
:
false
,
"rstrip"
:
false
,
"normalized"
:
false
,
"special"
:
true
},
{
"id"
:
128001
,
"content"
:
"<|end_of_text|>"
,
"single_word"
:
false
,
"lstrip"
:
false
,
"rstrip"
:
false
,
"normalized"
:
false
,
"special"
:
true
},
{
"id"
:
128002
,
"content"
:
"<|reserved_special_token_0|>"
,
"single_word"
:
false
,
"lstrip"
:
false
,
"rstrip"
:
false
,
"normalized"
:
false
,
"special"
:
true
},
{
"id"
:
128003
,
"content"
:
"<|reserved_special_token_1|>"
,
"single_word"
:
false
,
"lstrip"
:
false
,
"rstrip"
:
false
,
"normalized"
:
false
,
"special"
:
true
},
{
"id"
:
128004
,
"content"
:
"<|reserved_special_token_2|>"
,
"single_word"
:
false
,
"lstrip"
:
false
,
"rstrip"
:
false
,
"normalized"
:
false
,
"special"
:
true
},
{
"id"
:
128005
,
"content"
:
"<|reserved_special_token_3|>"
,
"single_word"
:
false
,
"lstrip"
:
false
,
"rstrip"
:
false
,
"normalized"
:
false
,
"special"
:
true
},
{
"id"
:
128006
,
"content"
:
"<|start_header_id|>"
,
"single_word"
:
false
,
"lstrip"
:
false
,
"rstrip"
:
false
,
"normalized"
:
false
,
"special"
:
true
},
{
"id"
:
128007
,
"content"
:
"<|end_header_id|>"
,
"single_word"
:
false
,
"lstrip"
:
false
,
"rstrip"
:
false
,
"normalized"
:
false
,
"special"
:
true
},
{
"id"
:
128008
,
"content"
:
"<|reserved_special_token_4|>"
,
"single_word"
:
false
,
"lstrip"
:
false
,
"rstrip"
:
false
,
"normalized"
:
false
,
"special"
:
true
},
{
"id"
:
128009
,
"content"
:
"<|eot_id|>"
,
"single_word"
:
false
,
"lstrip"
:
false
,
"rstrip"
:
false
,
"normalized"
:
false
,
"special"
:
true
},
{
"id"
:
128010
,
"content"
:
"<|reserved_special_token_5|>"
,
"single_word"
:
false
,
"lstrip"
:
false
,
"rstrip"
:
false
,
"normalized"
:
false
,
"special"
:
true
}
],
"normalizer"
:
null
,
"pre_tokenizer"
:
{
"type"
:
"Sequence"
,
"pretokenizers"
:
[
{
"type"
:
"Split"
,
"pattern"
:
{
"Regex"
:
"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^
\\
r
\\
n
\\
p{L}
\\
p{N}]?
\\
p{L}+|
\\
p{N}{1,3}| ?[^
\\
s
\\
p{L}
\\
p{N}]+[
\\
r
\\
n]*|
\\
s*[
\\
r
\\
n]+|
\\
s+(?!
\\
S)|
\\
s+"
},
"behavior"
:
"Isolated"
,
"invert"
:
false
},
{
"type"
:
"ByteLevel"
,
"add_prefix_space"
:
false
,
"trim_offsets"
:
true
,
"use_regex"
:
false
}
]
},
"post_processor"
:
{
"type"
:
"Sequence"
,
"processors"
:
[
{
"type"
:
"ByteLevel"
,
"add_prefix_space"
:
true
,
"trim_offsets"
:
false
,
"use_regex"
:
true
},
{
"type"
:
"TemplateProcessing"
,
"single"
:
[
{
"SpecialToken"
:
{
"id"
:
"<|begin_of_text|>"
,
"type_id"
:
0
}
},
{
"Sequence"
:
{
"id"
:
"A"
,
"type_id"
:
0
}
}
],
"pair"
:
[
{
"SpecialToken"
:
{
"id"
:
"<|begin_of_text|>"
,
"type_id"
:
0
}
},
{
"Sequence"
:
{
"id"
:
"A"
,
"type_id"
:
0
}
},
{
"SpecialToken"
:
{
"id"
:
"<|begin_of_text|>"
,
"type_id"
:
1
}
},
{
"Sequence"
:
{
"id"
:
"B"
,
"type_id"
:
1
}
}
],
"special_tokens"
:
{
"<|begin_of_text|>"
:
{
"id"
:
"<|begin_of_text|>"
,
"ids"
:
[
128000
],
"tokens"
:
[
"<|begin_of_text|>"
]
}
}
}
]
},
"decoder"
:
{
"type"
:
"ByteLevel"
,
"add_prefix_space"
:
true
,
"trim_offsets"
:
true
,
"use_regex"
:
true
},
"model"
:
{
"type"
:
"BPE"
,
"dropout"
:
null
,
"unk_token"
:
null
,
"continuing_subword_prefix"
:
null
,
"end_of_word_suffix"
:
null
,
"fuse_unk"
:
false
,
"byte_fallback"
:
false
,
"ignore_merges"
:
true
,
"vocab"
:
{},
"merges"
:
[]
}
}
\ No newline at end of file
llm/rust/triton-llm/tests/data/sample-models/mock-llama-3.1-8b-instruct/tokenizer_config.json
0 → 100644
View file @
65a2dfab
{
"added_tokens_decoder"
:
{
"128000"
:
{
"content"
:
"<|begin_of_text|>"
,
"lstrip"
:
false
,
"normalized"
:
false
,
"rstrip"
:
false
,
"single_word"
:
false
,
"special"
:
true
},
"128001"
:
{
"content"
:
"<|end_of_text|>"
,
"lstrip"
:
false
,
"normalized"
:
false
,
"rstrip"
:
false
,
"single_word"
:
false
,
"special"
:
true
},
"128002"
:
{
"content"
:
"<|reserved_special_token_0|>"
,
"lstrip"
:
false
,
"normalized"
:
false
,
"rstrip"
:
false
,
"single_word"
:
false
,
"special"
:
true
},
"128003"
:
{
"content"
:
"<|reserved_special_token_1|>"
,
"lstrip"
:
false
,
"normalized"
:
false
,
"rstrip"
:
false
,
"single_word"
:
false
,
"special"
:
true
},
"128004"
:
{
"content"
:
"<|reserved_special_token_2|>"
,
"lstrip"
:
false
,
"normalized"
:
false
,
"rstrip"
:
false
,
"single_word"
:
false
,
"special"
:
true
},
"128005"
:
{
"content"
:
"<|reserved_special_token_3|>"
,
"lstrip"
:
false
,
"normalized"
:
false
,
"rstrip"
:
false
,
"single_word"
:
false
,
"special"
:
true
},
"128006"
:
{
"content"
:
"<|start_header_id|>"
,
"lstrip"
:
false
,
"normalized"
:
false
,
"rstrip"
:
false
,
"single_word"
:
false
,
"special"
:
true
},
"128007"
:
{
"content"
:
"<|end_header_id|>"
,
"lstrip"
:
false
,
"normalized"
:
false
,
"rstrip"
:
false
,
"single_word"
:
false
,
"special"
:
true
},
"128008"
:
{
"content"
:
"<|reserved_special_token_4|>"
,
"lstrip"
:
false
,
"normalized"
:
false
,
"rstrip"
:
false
,
"single_word"
:
false
,
"special"
:
true
},
"128009"
:
{
"content"
:
"<|eot_id|>"
,
"lstrip"
:
false
,
"normalized"
:
false
,
"rstrip"
:
false
,
"single_word"
:
false
,
"special"
:
true
}
},
"bos_token"
:
"<|begin_of_text|>"
,
"chat_template"
:
"{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>
\n\n
'+ message['content'] | trim %}{% if loop.first %}{% set content = bos_token + content %}{% endif %}{% if not loop.last %}{% set content = content + '<|eot_id|>'%}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|eot_id|><|start_header_id|>assistant<|end_header_id|>
\n\n
' }}{% endif %}"
,
"clean_up_tokenization_spaces"
:
true
,
"eos_token"
:
"<|eot_id|>"
,
"model_input_names"
:
[
"input_ids"
,
"attention_mask"
],
"model_max_length"
:
1000000000000000019884624838656
,
"tokenizer_class"
:
"PreTrainedTokenizerFast"
}
llm/rust/triton-llm/tests/model_card.rs
0 → 100644
View file @
65a2dfab
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use
triton_llm
::
model_card
::
model
::{
ModelDeploymentCard
,
ModelInfoType
,
TokenizerKind
,
PromptFormatterArtifact
};
use
tempfile
::
tempdir
;
#[tokio::test]
async fn test_model_info_from_hf_like_local_repo() {
    // Build a deployment card from the checked-in mock HF-style repo and
    // verify the fields parsed from its config.json are exposed via ModelInfo.
    let repo_dir = "tests/data/sample-models/mock-llama-3.1-8b-instruct";
    let card = ModelDeploymentCard::from_local_path(repo_dir).await.unwrap();
    let model_info = card.model_info.get_model_info().await.unwrap();

    assert_eq!(model_info.model_type(), "llama");
    assert_eq!(model_info.bos_token_id(), 128000);
    assert_eq!(model_info.eos_token_ids(), vec![128009]);
    assert_eq!(model_info.max_position_embeddings(), 8192);
    assert_eq!(model_info.vocab_size(), 128256);
}
#[tokio::test]
async fn test_model_info_from_non_existent_local_repo() {
    // A path that does not exist must surface as an Err, not a panic.
    let missing_dir = "tests/data/sample-models/this-model-does-not-exist";
    let outcome = ModelDeploymentCard::from_local_path(missing_dir).await;
    assert!(outcome.is_err());
}
#[tokio::test]
async fn test_tokenizer_from_hf_like_local_repo() {
    let repo_dir = "tests/data/sample-models/mock-llama-3.1-8b-instruct";
    let card = ModelDeploymentCard::from_local_path(repo_dir).await.unwrap();

    // The mock repo ships a tokenizer.json, so the HF JSON tokenizer
    // variant is the one we expect the card to pick up.
    assert!(
        matches!(card.tokenizer, TokenizerKind::HfTokenizerJson(_)),
        "Expected HfTokenizerJson"
    );
}
#[tokio::test]
async fn test_prompt_formatter_from_hf_like_local_repo() {
    let repo_dir = "tests/data/sample-models/mock-llama-3.1-8b-instruct";
    let card = ModelDeploymentCard::from_local_path(repo_dir).await.unwrap();

    // tokenizer_config.json in the mock repo carries the chat template,
    // so the card should resolve a HfTokenizerConfigJson prompt formatter.
    assert!(
        matches!(
            card.prompt_formatter,
            Some(PromptFormatterArtifact::HfTokenizerConfigJson(_))
        ),
        "Expected HfTokenizerConfigJson prompt formatter"
    );
}
#[tokio::test]
async fn test_missing_required_files() {
    // An empty temporary directory has no config.json, so building the
    // card must fail and the error should mention the extraction failure.
    let empty_dir = tempdir().unwrap();
    let outcome = ModelDeploymentCard::from_local_path(empty_dir.path()).await;
    assert!(outcome.is_err());

    let message = outcome.unwrap_err().to_string();
    assert!(message.contains("unable to extract"));
}
\ No newline at end of file
python-wheel/Cargo.lock
View file @
65a2dfab
...
@@ -469,6 +469,7 @@ dependencies = [
...
@@ -469,6 +469,7 @@ dependencies = [
"iana-time-zone",
"iana-time-zone",
"js-sys",
"js-sys",
"num-traits",
"num-traits",
"serde",
"wasm-bindgen",
"wasm-bindgen",
"windows-targets",
"windows-targets",
]
]
...
@@ -3189,6 +3190,7 @@ dependencies = [
...
@@ -3189,6 +3190,7 @@ dependencies = [
"async-stream",
"async-stream",
"async-trait",
"async-trait",
"axum 0.8.1",
"axum 0.8.1",
"blake3",
"bytes",
"bytes",
"chrono",
"chrono",
"derive_builder",
"derive_builder",
...
...
runtime/rust/src/lib.rs
View file @
65a2dfab
...
@@ -38,11 +38,11 @@ pub mod protocols;
...
@@ -38,11 +38,11 @@ pub mod protocols;
pub
mod
runnable
;
pub
mod
runnable
;
pub
mod
runtime
;
pub
mod
runtime
;
pub
mod
service
;
pub
mod
service
;
pub
mod
slug
;
pub
mod
transports
;
pub
mod
transports
;
pub
mod
worker
;
pub
mod
worker
;
pub
mod
distributed
;
pub
mod
distributed
;
pub
use
futures
::
stream
;
pub
use
futures
::
stream
;
pub
use
tokio_util
::
sync
::
CancellationToken
;
pub
use
tokio_util
::
sync
::
CancellationToken
;
pub
use
worker
::
Worker
;
pub
use
worker
::
Worker
;
...
...
runtime/rust/src/
transports/nats/
slug.rs
→
runtime/rust/src/slug.rs
View file @
65a2dfab
File moved
runtime/rust/src/transports/nats.rs
View file @
65a2dfab
...
@@ -38,8 +38,7 @@ use std::path::PathBuf;
...
@@ -38,8 +38,7 @@ use std::path::PathBuf;
use
tokio
::
time
;
use
tokio
::
time
;
use
validator
::{
Validate
,
ValidationError
};
use
validator
::{
Validate
,
ValidationError
};
mod
slug
;
pub
use
crate
::
slug
::
Slug
;
pub
use
slug
::
Slug
;
use
tracing
as
log
;
use
tracing
as
log
;
#[derive(Clone)]
#[derive(Clone)]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment