Unverified Commit 1954fcfa authored by Graham King's avatar Graham King Committed by GitHub
Browse files

chore: Remove service_name from ModelDeploymentCard (#2349)

parent ccc8815b
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use super::*;
use llm_rs::model_card::model::ModelDeploymentCard as RsModelDeploymentCard;
use llm_rs::model_card::ModelDeploymentCard as RsModelDeploymentCard;
#[pyclass]
#[derive(Clone)]
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Backend
//!
......@@ -33,7 +21,7 @@ use anyhow::{Error, Result};
use futures::stream::{self, StreamExt};
use tracing as log;
use crate::model_card::model::{ModelDeploymentCard, TokenizerKind};
use crate::model_card::{ModelDeploymentCard, TokenizerKind};
use dynamo_runtime::{
pipeline::{
async_trait, AsyncEngineContextProvider, ManyOut, Operator, ResponseStream,
......
......@@ -251,12 +251,15 @@ impl LocalModel {
&self.full_path
}
/// Human friendly model name. This is the correct name.
pub fn display_name(&self) -> &str {
&self.card.display_name
}
/// The name under which we make this model available over HTTP.
/// A slugified version of the model's name, for use in NATS, etcd, etc.
pub fn service_name(&self) -> &str {
&self.card.service_name
self.card.slug().as_ref()
}
pub fn request_template(&self) -> Option<RequestTemplate> {
......
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
......@@ -23,7 +11,7 @@ use async_nats::client::{
};
use crate::{
model_card::model::ModelDeploymentCard,
model_card::ModelDeploymentCard,
protocols::common::llm_backend::{LLMEngineOutput, PreprocessedRequest},
};
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
pub mod create;
pub mod model;
pub use model::ModelDeploymentCard;
//! # Model Deployment Card
//!
//! The ModelDeploymentCard (MDC) is the primary model configuration structure that will be available to any
//! component that needs to interact with the model or its dependent artifacts.
//!
//! The ModelDeploymentCard contains LLM model deployment configuration information:
//! - Display name and service name for the model
//! - Model information (ModelInfoType)
//! - Tokenizer configuration (TokenizerKind)
//! - Prompt formatter settings (PromptFormatterArtifact)
//! - Various metadata like revision, publish time, etc.
use std::fmt;
use std::fs::File;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Duration;
use anyhow::{Context, Result};
use derive_builder::Builder;
use dynamo_runtime::{slug::Slug, storage::key_value_store::Versioned, transports::nats};
use serde::{Deserialize, Serialize};
use tokenizers::Tokenizer as HfTokenizer;
use url::Url;
use crate::gguf::{Content, ContentConfig, ModelConfigLike};
use crate::protocols::TokenIdType;
/// Identify model deployment cards in the key-value store
pub const ROOT_PATH: &str = "mdc";
/// If a model deployment card hasn't been refreshed in this much time the worker is likely gone
const CARD_MAX_AGE: chrono::TimeDelta = chrono::TimeDelta::minutes(5);
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "snake_case")]
pub enum ModelInfoType {
HfConfigJson(String),
GGUF(PathBuf),
}
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "snake_case")]
pub enum TokenizerKind {
HfTokenizerJson(String),
GGUF(Box<HfTokenizer>),
}
/// Supported types of prompt formatters.
///
/// We need a way to associate the prompt formatter template definition with an associated
/// data model which is expected for rendering.
///
/// All current prompt formatters are Jinja2 templates which use the OpenAI ChatCompletionRequest
/// format. However, we currently do not have a discovery path to know if the model supports tool use
/// unless we inspect the template.
///
/// TODO(): Add an enum for the PromptFormatDataModel with at minimum arms for:
/// - OaiChat
/// - OaiChatToolUse
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "snake_case")]
pub enum PromptFormatterArtifact {
HfTokenizerConfigJson(String),
HfChatTemplate(String),
GGUF(PathBuf),
}
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Hash)]
#[serde(rename_all = "snake_case")]
pub enum PromptContextMixin {
/// Support OAI Chat Messages and Tools
OaiChat,
/// Enables templates with `{{datetime}}` to be rendered with the current date and time.
Llama3DateTime,
}
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "snake_case")]
pub enum GenerationConfig {
HfGenerationConfigJson(String),
GGUF(PathBuf),
}
#[derive(Serialize, Deserialize, Clone, Debug, Builder, Default)]
pub struct ModelDeploymentCard {
/// Human readable model name, e.g. "Meta Llama 3.1 8B Instruct"
pub display_name: String,
// Cache the Slugified display_name so we can share references to it
slug: Slug,
/// Model information
pub model_info: Option<ModelInfoType>,
/// Tokenizer configuration
pub tokenizer: Option<TokenizerKind>,
/// Prompt Formatter configuration
#[serde(default, skip_serializing_if = "Option::is_none")]
pub prompt_formatter: Option<PromptFormatterArtifact>,
/// chat template may be stored as a separate file instead of in `prompt_formatter`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub chat_template_file: Option<PromptFormatterArtifact>,
/// Generation config - default sampling params
#[serde(default, skip_serializing_if = "Option::is_none")]
pub gen_config: Option<GenerationConfig>,
/// Prompt Formatter Config
#[serde(default, skip_serializing_if = "Option::is_none")]
pub prompt_context: Option<Vec<PromptContextMixin>>,
/// When this card was last advertised by a worker. None if not yet published.
pub last_published: Option<chrono::DateTime<chrono::Utc>>,
/// Incrementing count of how many times we published this card
#[serde(default, skip_serializing)]
pub revision: u64,
/// Max context (in number of tokens) this model can handle
pub context_length: u32,
/// Size of a KV cache block - vllm only currently
/// Passed to the engine and the KV router.
pub kv_cache_block_size: u32,
/// How many times a request can be migrated to another worker if the HTTP server lost
/// connection to the current worker.
pub migration_limit: u32,
/// User-defined metadata for custom worker behavior
#[serde(default, skip_serializing_if = "Option::is_none")]
pub user_data: Option<serde_json::Value>,
}
impl ModelDeploymentCard {
pub fn builder() -> ModelDeploymentCardBuilder {
ModelDeploymentCardBuilder::default()
}
/// Create a ModelDeploymentCard where only the name is filled in.
///
/// Single-process setups don't need an MDC to communicate model details, but it
/// simplifies the code to assume we always have one. This is how you get one in those
/// cases. A quasi-null object: <https://en.wikipedia.org/wiki/Null_object_pattern>
pub fn with_name_only(name: &str) -> ModelDeploymentCard {
ModelDeploymentCard {
display_name: name.to_string(),
slug: Slug::from_string(name),
..Default::default()
}
}
/// How often we should check if a model deployment card expired because it's workers are gone
pub fn expiry_check_period() -> Duration {
match CARD_MAX_AGE.to_std() {
Ok(duration) => duration / 3,
Err(_) => {
// Only happens if CARD_MAX_AGE is negative, which it isn't
unreachable!("Cannot run card expiry watcher, invalid CARD_MAX_AGE");
}
}
}
/// Load a model deployment card from a JSON file
pub fn load_from_json_file<P: AsRef<Path>>(file: P) -> std::io::Result<Self> {
Ok(serde_json::from_str(&std::fs::read_to_string(file)?)?)
}
/// Load a model deployment card from a JSON string
pub fn load_from_json_str(json: &str) -> Result<Self, anyhow::Error> {
Ok(serde_json::from_str(json)?)
}
//
// Methods
//
/// Save the model deployment card to a JSON file
pub fn save_to_json_file(&self, file: &str) -> Result<(), anyhow::Error> {
std::fs::write(file, self.to_json()?)?;
Ok(())
}
pub fn slug(&self) -> &Slug {
&self.slug
}
/// Serialize the model deployment card to a JSON string
pub fn to_json(&self) -> Result<String, anyhow::Error> {
Ok(serde_json::to_string(self)?)
}
pub fn mdcsum(&self) -> String {
let json = self.to_json().unwrap();
format!("{}", blake3::hash(json.as_bytes()))
}
/// Was this card last published a long time ago, suggesting the worker is gone?
pub fn is_expired(&self) -> bool {
if let Some(last_published) = self.last_published.as_ref() {
chrono::Utc::now() - last_published > CARD_MAX_AGE
} else {
false
}
}
/// Is this a full model card with tokenizer?
/// There are cases where we have a placeholder card (see `with_name_only`).
pub fn has_tokenizer(&self) -> bool {
self.tokenizer.is_some()
}
pub fn tokenizer_hf(&self) -> anyhow::Result<HfTokenizer> {
match &self.tokenizer {
Some(TokenizerKind::HfTokenizerJson(file)) => {
HfTokenizer::from_file(file).map_err(anyhow::Error::msg)
}
Some(TokenizerKind::GGUF(t)) => Ok(*t.clone()),
None => {
anyhow::bail!("Blank ModelDeploymentCard does not have a tokenizer");
}
}
}
pub fn is_gguf(&self) -> bool {
match &self.model_info {
Some(info) => info.is_gguf(),
None => false,
}
}
/// Move the files this MDC uses into the NATS object store.
/// Updates the URI's to point to NATS.
pub async fn move_to_nats(&mut self, nats_client: nats::Client) -> Result<()> {
let nats_addr = nats_client.addr();
let bucket_name = self.slug().clone();
tracing::debug!(
nats_addr,
%bucket_name,
"Uploading model deployment card fields to NATS"
);
macro_rules! nats_upload {
($field:expr, $enum_variant:path, $filename:literal) => {
if let Some($enum_variant(src_file)) = $field.take() {
if !nats::is_nats_url(&src_file) {
let target = format!("nats://{nats_addr}/{bucket_name}/{}", $filename);
nats_client
.object_store_upload(
&std::path::PathBuf::from(&src_file),
url::Url::parse(&target)?,
)
.await?;
$field = Some($enum_variant(target));
}
}
};
}
nats_upload!(self.model_info, ModelInfoType::HfConfigJson, "config.json");
nats_upload!(
self.prompt_formatter,
PromptFormatterArtifact::HfTokenizerConfigJson,
"tokenizer_config.json"
);
nats_upload!(
self.chat_template_file,
PromptFormatterArtifact::HfChatTemplate,
"chat_template.jinja"
);
nats_upload!(
self.tokenizer,
TokenizerKind::HfTokenizerJson,
"tokenizer.json"
);
nats_upload!(
self.gen_config,
GenerationConfig::HfGenerationConfigJson,
"generation_config.json"
);
Ok(())
}
/// Move the files this MDC uses from the NATS object store to local disk.
/// Updates the URI's to point to the created files.
///
/// The returned TempDir must be kept alive, it cleans up on drop.
pub async fn move_from_nats(&mut self, nats_client: nats::Client) -> Result<tempfile::TempDir> {
let nats_addr = nats_client.addr();
let bucket_name = self.slug();
let target_dir = tempfile::TempDir::with_prefix(bucket_name.to_string())?;
tracing::debug!(
nats_addr,
%bucket_name,
target_dir = %target_dir.path().display(),
"Downloading model deployment card fields from NATS"
);
macro_rules! nats_download {
($field:expr, $enum_variant:path, $filename:literal) => {
if let Some($enum_variant(src_url)) = $field.take() {
if nats::is_nats_url(&src_url) {
let target = target_dir.path().join($filename);
nats_client
.object_store_download(Url::parse(&src_url)?, &target)
.await?;
$field = Some($enum_variant(target.display().to_string()));
}
}
};
}
nats_download!(self.model_info, ModelInfoType::HfConfigJson, "config.json");
nats_download!(
self.prompt_formatter,
PromptFormatterArtifact::HfTokenizerConfigJson,
"tokenizer_config.json"
);
nats_download!(
self.chat_template_file,
PromptFormatterArtifact::HfChatTemplate,
"chat_template.jinja"
);
nats_download!(
self.tokenizer,
TokenizerKind::HfTokenizerJson,
"tokenizer.json"
);
nats_download!(
self.gen_config,
GenerationConfig::HfGenerationConfigJson,
"generation_config.json"
);
Ok(target_dir)
}
/// Delete this card from the key-value store and it's URLs from the object store
pub async fn delete_from_nats(&mut self, nats_client: nats::Client) -> Result<()> {
let nats_addr = nats_client.addr();
let bucket_name = self.slug();
tracing::trace!(
nats_addr,
%bucket_name,
"Delete model deployment card from NATS"
);
nats_client
.object_store_delete_bucket(bucket_name.as_ref())
.await
}
/// Allow user to override the name we register this model under.
/// Corresponds to vllm's `--served-model-name`.
pub fn set_name(&mut self, name: &str) {
self.display_name = name.to_string();
self.slug = Slug::from_string(name);
}
/// Build an in-memory ModelDeploymentCard from either:
/// - a folder containing config.json, tokenizer.json and token_config.json
/// - a GGUF file
pub async fn load(config_path: impl AsRef<Path>) -> anyhow::Result<ModelDeploymentCard> {
let config_path = config_path.as_ref();
if config_path.is_dir() {
Self::from_local_path(config_path).await
} else {
Self::from_gguf(config_path).await
}
}
/// Creates a ModelDeploymentCard from a local directory path.
///
/// Currently HuggingFace format is supported and following files are expected:
/// - config.json: Model configuration in HuggingFace format
/// - tokenizer.json: Tokenizer configuration in HuggingFace format
/// - tokenizer_config.json: Optional prompt formatter configuration
///
/// # Arguments
/// * `local_root_dir` - Path to the local model directory
///
/// # Errors
/// Returns an error if:
/// - The path doesn't exist or isn't a directory
/// - The path contains invalid Unicode characters
/// - Required model files are missing or invalid
async fn from_local_path(local_root_dir: impl AsRef<Path>) -> anyhow::Result<Self> {
let local_root_dir = local_root_dir.as_ref();
check_valid_local_repo_path(local_root_dir)?;
let repo_id = local_root_dir
.canonicalize()?
.to_str()
.ok_or_else(|| anyhow::anyhow!("Path contains invalid Unicode"))?
.to_string();
let model_name = local_root_dir
.file_name()
.and_then(|n| n.to_str())
.ok_or_else(|| anyhow::anyhow!("Invalid model directory name"))?;
Self::from_repo(&repo_id, model_name).await
}
async fn from_gguf(gguf_file: &Path) -> anyhow::Result<Self> {
let model_name = gguf_file
.iter()
.next_back()
.map(|n| n.to_string_lossy().to_string());
let Some(model_name) = model_name else {
// I think this would only happy on an empty path
anyhow::bail!(
"Could not extract model name from path '{}'",
gguf_file.display()
);
};
// TODO: we do this in HFConfig also, unify
let content = load_gguf(gguf_file)?;
let context_length = content.get_metadata()[&format!("{}.context_length", content.arch())]
.to_u32()
.unwrap_or(0);
tracing::debug!(context_length, "Loaded context length from GGUF");
Ok(Self {
display_name: model_name.to_string(),
slug: Slug::from_string(model_name),
model_info: Some(ModelInfoType::GGUF(gguf_file.to_path_buf())),
tokenizer: Some(TokenizerKind::from_gguf(gguf_file)?),
gen_config: None, // AFAICT there is no equivalent in a GGUF
prompt_formatter: Some(PromptFormatterArtifact::GGUF(gguf_file.to_path_buf())),
chat_template_file: None,
prompt_context: None, // TODO - auto-detect prompt context
revision: 0,
last_published: None,
context_length,
kv_cache_block_size: 0,
migration_limit: 0,
user_data: None,
})
}
#[allow(dead_code)]
async fn from_ngc_repo(_: &str) -> anyhow::Result<Self> {
Err(anyhow::anyhow!(
"ModelDeploymentCard::from_ngc_repo is not implemented"
))
}
async fn from_repo(repo_id: &str, model_name: &str) -> anyhow::Result<Self> {
// This is usually the right choice
let context_length = crate::file_json_field(
&PathBuf::from(repo_id).join("config.json"),
"max_position_embeddings",
)
// But sometimes this is
.or_else(|_| {
crate::file_json_field(
&PathBuf::from(repo_id).join("tokenizer_config.json"),
"model_max_length",
)
})
// If neither of those are present let the engine default it
.unwrap_or(0);
Ok(Self {
display_name: model_name.to_string(),
slug: Slug::from_string(model_name),
model_info: Some(ModelInfoType::from_repo(repo_id).await?),
tokenizer: Some(TokenizerKind::from_repo(repo_id).await?),
gen_config: GenerationConfig::from_repo(repo_id).await.ok(), // optional
prompt_formatter: PromptFormatterArtifact::from_repo(repo_id).await?,
chat_template_file: PromptFormatterArtifact::chat_template_from_repo(repo_id).await?,
prompt_context: None, // TODO - auto-detect prompt context
revision: 0,
last_published: None,
context_length,
kv_cache_block_size: 0, // set later
migration_limit: 0,
user_data: None,
})
}
}
impl Versioned for ModelDeploymentCard {
fn revision(&self) -> u64 {
self.revision
}
fn set_revision(&mut self, revision: u64) {
self.last_published = Some(chrono::Utc::now());
self.revision = revision;
}
}
impl fmt::Display for ModelDeploymentCard {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self.slug())
}
}
pub trait ModelInfo: Send + Sync {
/// Model type
fn model_type(&self) -> String;
/// Token ID for the beginning of sequence
fn bos_token_id(&self) -> TokenIdType;
/// Token ID for the end of sequence
fn eos_token_ids(&self) -> Vec<TokenIdType>;
/// Maximum position embeddings / max sequence length
/// TODO: This is only used in a single test, no other code. Remove?
fn max_position_embeddings(&self) -> Option<usize>;
/// Vocabulary size
/// TODO: This is only used in a single test, no other code. Remove?
fn vocab_size(&self) -> Option<usize>;
}
impl ModelInfoType {
pub async fn get_model_info(&self) -> Result<Arc<dyn ModelInfo>> {
match self {
Self::HfConfigJson(info) => HFConfig::from_json_file(info).await,
Self::GGUF(path) => HFConfig::from_gguf(path),
}
}
pub fn is_gguf(&self) -> bool {
matches!(self, Self::GGUF(_))
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
struct HFConfig {
/// denotes the mixin to the flattened data model which can be present
/// in the config.json file
architectures: Vec<String>,
/// general model type
model_type: String,
text_config: Option<HFTextConfig>,
// Sometimes it's inside HFTextConfig, sometimes it's here
eos_token_id: Option<serde_json::Value>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
struct HFTextConfig {
// It can take multiple attempts to load this, so Option
bos_token_id: Option<TokenIdType>,
// We set this once bos_token_id is loaded so we don't have to deal with Option
#[serde(default)]
final_bos_token_id: TokenIdType,
eos_token_id: Option<serde_json::Value>,
#[serde(default)]
final_eos_token_ids: Vec<TokenIdType>,
/// max sequence length
max_position_embeddings: Option<usize>,
/// number of layers in the model
num_hidden_layers: usize,
/// number of attention heads in the model
num_attention_heads: Option<usize>,
/// Vocabulary size
vocab_size: Option<usize>,
}
impl HFConfig {
async fn from_json_file(file: &str) -> Result<Arc<dyn ModelInfo>> {
let file_pathbuf = PathBuf::from(file);
let contents = std::fs::read_to_string(file)?;
let mut config: Self = serde_json::from_str(&contents)?;
if config.text_config.is_none() {
let text_config: HFTextConfig = serde_json::from_str(&contents)?;
config.text_config = Some(text_config);
}
// Sometimes bos_token_id is in generation_config.json not config.json
let Some(text_config) = config.text_config.as_mut() else {
anyhow::bail!(
"Missing text config fields (model_type, eos_token_ids, etc) in config.json"
);
};
if text_config.bos_token_id.is_none() {
let bos_token_id = crate::file_json_field::<TokenIdType>(
&Path::join(
file_pathbuf.parent().unwrap_or(&PathBuf::from("")),
"generation_config.json",
),
"bos_token_id",
)
.context(
"missing bos_token_id in generation_config.json and config.json, cannot load",
)?;
text_config.bos_token_id = Some(bos_token_id);
}
// Now that we have it for sure, set it in the non-Option field
let final_bos_token_id = text_config.bos_token_id.take().unwrap();
text_config.final_bos_token_id = final_bos_token_id;
// TODO: refactor this when we switch to per-architecture tokenization
let final_eos_token_ids: Vec<TokenIdType> = config
.eos_token_id
.as_ref()
.or(text_config.eos_token_id.as_ref())
.and_then(|v| {
if v.is_number() {
v.as_number()
.and_then(|n| n.as_u64())
.map(|n| vec![n as TokenIdType])
} else if v.is_array() {
let arr = v.as_array().unwrap(); // Safety: We just checked
Some(
arr.iter()
.filter_map(|inner_v| {
inner_v
.as_number()
.and_then(|n| n.as_u64())
.map(|n| n as TokenIdType)
})
.collect(),
)
} else {
tracing::error!(
?v,
file,
"eos_token_id is not a number or an array, cannot use"
);
None
}
})
.or_else(|| {
// Maybe it's in generation_config.json
crate::file_json_field(
&Path::join(
file_pathbuf.parent().unwrap_or(&PathBuf::from("")),
"generation_config.json",
),
"eos_token_id",
)
.inspect_err(
|err| tracing::warn!(%err, "Missing eos_token_id in generation_config.json"),
)
.ok()
})
.ok_or_else(|| {
anyhow::anyhow!(
"missing eos_token_id in config.json and generation_config.json, cannot load"
)
})?;
text_config.final_eos_token_ids = final_eos_token_ids;
Ok(Arc::new(config))
}
fn from_gguf(gguf_file: &Path) -> Result<Arc<dyn ModelInfo>> {
let content = load_gguf(gguf_file)?;
let model_config_metadata: ContentConfig = (&content).into();
let num_hidden_layers =
content.get_metadata()[&format!("{}.block_count", content.arch())].to_u32()? as usize;
let bos_token_id = content.get_metadata()["tokenizer.ggml.bos_token_id"].to_u32()?;
let eos_token_id = content.get_metadata()["tokenizer.ggml.eos_token_id"].to_u32()?;
// to_vec returns a Vec that's already there, so it's cheap
let vocab_size = content.get_metadata()["tokenizer.ggml.tokens"]
.to_vec()?
.len();
let arch = content.arch().to_string();
Ok(Arc::new(HFConfig {
architectures: vec![format!("{}ForCausalLM", capitalize(&arch))],
// "general.architecture"
model_type: arch,
text_config: Some(HFTextConfig {
bos_token_id: None,
final_bos_token_id: bos_token_id,
eos_token_id: None,
final_eos_token_ids: vec![eos_token_id],
// "llama.context_length"
max_position_embeddings: Some(model_config_metadata.max_seq_len()),
// "llama.block_count"
num_hidden_layers,
// "llama.attention.head_count"
num_attention_heads: Some(model_config_metadata.num_attn_heads()),
// "tokenizer.ggml.tokens".len()
vocab_size: Some(vocab_size),
}),
eos_token_id: None,
}))
}
}
impl ModelInfo for HFConfig {
fn model_type(&self) -> String {
self.model_type.clone()
}
fn bos_token_id(&self) -> TokenIdType {
self.text_config.as_ref().unwrap().final_bos_token_id
}
fn eos_token_ids(&self) -> Vec<TokenIdType> {
self.text_config
.as_ref()
.unwrap()
.final_eos_token_ids
.clone()
}
fn max_position_embeddings(&self) -> Option<usize> {
self.text_config.as_ref().unwrap().max_position_embeddings
}
fn vocab_size(&self) -> Option<usize> {
self.text_config.as_ref().unwrap().vocab_size
}
}
impl TokenizerKind {
pub fn from_gguf(gguf_file: &Path) -> anyhow::Result<Self> {
let content = load_gguf(gguf_file)?;
let out = crate::gguf::convert_gguf_to_hf_tokenizer(&content)
.with_context(|| gguf_file.display().to_string())?;
Ok(TokenizerKind::GGUF(Box::new(out.tokenizer)))
}
}
pub(crate) fn load_gguf(gguf_file: &Path) -> anyhow::Result<Content> {
let filename = gguf_file.display().to_string();
let mut f = File::open(gguf_file).with_context(|| filename.clone())?;
// vec because GGUF can be split into multiple files (shards)
let mut readers = vec![&mut f];
crate::gguf::Content::from_readers(&mut readers).with_context(|| filename.clone())
}
fn capitalize(s: &str) -> String {
let mut chars = s.chars();
match chars.next() {
None => String::new(),
Some(first) => first.to_uppercase().collect::<String>() + &chars.as_str().to_lowercase(),
}
}
impl ModelInfoType {
pub async fn from_repo(repo_id: &str) -> Result<Self> {
Self::try_is_hf_repo(repo_id)
.await
.with_context(|| format!("unable to extract model info from repo {}", repo_id))
}
async fn try_is_hf_repo(repo: &str) -> anyhow::Result<Self> {
Ok(Self::HfConfigJson(
check_for_file(repo, "config.json").await?,
))
}
}
impl PromptFormatterArtifact {
pub async fn from_repo(repo_id: &str) -> Result<Option<Self>> {
// we should only error if we expect a prompt formatter and it's not found
// right now, we don't know when to expect it, so we just return Ok(Some/None)
Ok(Self::try_is_hf_repo(repo_id)
.await
.with_context(|| format!("unable to extract prompt format from repo {}", repo_id))
.ok())
}
pub async fn chat_template_from_repo(repo_id: &str) -> Result<Option<Self>> {
Ok(Self::chat_template_try_is_hf_repo(repo_id)
.await
.with_context(|| format!("unable to extract prompt format from repo {}", repo_id))
.ok())
}
async fn chat_template_try_is_hf_repo(repo: &str) -> anyhow::Result<Self> {
Ok(Self::HfChatTemplate(
check_for_file(repo, "chat_template.jinja").await?,
))
}
async fn try_is_hf_repo(repo: &str) -> anyhow::Result<Self> {
Ok(Self::HfTokenizerConfigJson(
check_for_file(repo, "tokenizer_config.json").await?,
))
}
}
impl TokenizerKind {
pub async fn from_repo(repo_id: &str) -> Result<Self> {
Self::try_is_hf_repo(repo_id)
.await
.with_context(|| format!("unable to extract tokenizer kind from repo {}", repo_id))
}
async fn try_is_hf_repo(repo: &str) -> anyhow::Result<Self> {
Ok(Self::HfTokenizerJson(
check_for_file(repo, "tokenizer.json").await?,
))
}
}
impl GenerationConfig {
pub async fn from_repo(repo_id: &str) -> Result<Self> {
Self::try_is_hf_repo(repo_id)
.await
.with_context(|| format!("unable to extract generation config from repo {repo_id}"))
}
async fn try_is_hf_repo(repo: &str) -> anyhow::Result<Self> {
Ok(Self::HfGenerationConfigJson(
check_for_file(repo, "generation_config.json").await?,
))
}
}
/// Checks if the provided path contains the expected file.
async fn check_for_file(repo_id: &str, file: &str) -> anyhow::Result<String> {
let p = PathBuf::from(repo_id).join(file);
let name = p.display().to_string();
if !p.exists() {
anyhow::bail!("File not found: {name}")
}
Ok(name)
}
/// Checks if the provided path is a valid local repository path.
///
/// # Arguments
/// * `path` - Path to validate
///
/// # Errors
/// Returns an error if the path doesn't exist or isn't a directory
fn check_valid_local_repo_path(path: impl AsRef<Path>) -> Result<()> {
let path = path.as_ref();
if !path.exists() {
return Err(anyhow::anyhow!(
"Model path does not exist: {}",
path.display()
));
}
if !path.is_dir() {
return Err(anyhow::anyhow!(
"Model path is not a directory: {}",
path.display()
));
}
Ok(())
}
#[cfg(test)]
mod tests {
use super::HFConfig;
use std::path::Path;
#[tokio::test]
pub async fn test_config_json_llama3() -> anyhow::Result<()> {
let config_file = Path::new(env!("CARGO_MANIFEST_DIR"))
.join("tests/data/sample-models/mock-llama-3.1-8b-instruct/config.json");
let config = HFConfig::from_json_file(&config_file.display().to_string()).await?;
assert_eq!(config.bos_token_id(), 128000);
Ok(())
}
#[tokio::test]
pub async fn test_config_json_llama4() -> anyhow::Result<()> {
let config_file = Path::new(env!("CARGO_MANIFEST_DIR"))
.join("tests/data/sample-models/Llama-4-Scout-17B-16E-Instruct/config.json");
let config = HFConfig::from_json_file(&config_file.display().to_string()).await?;
assert_eq!(config.bos_token_id(), 200000);
Ok(())
}
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use crate::model_card::model::ModelDeploymentCard;
use anyhow::{Context, Result};
use std::path::{Path, PathBuf};
use crate::model_card::model::{ModelInfoType, PromptFormatterArtifact, TokenizerKind};
use super::model::GenerationConfig;
impl ModelDeploymentCard {
/// Allow user to override the name we register this model under.
/// Corresponds to vllm's `--served-model-name`.
pub fn set_name(&mut self, name: &str) {
self.display_name = name.to_string();
self.service_name = name.to_string();
}
/// Build an in-memory ModelDeploymentCard from either:
/// - a folder containing config.json, tokenizer.json and token_config.json
/// - a GGUF file
pub async fn load(config_path: impl AsRef<Path>) -> anyhow::Result<ModelDeploymentCard> {
let config_path = config_path.as_ref();
if config_path.is_dir() {
Self::from_local_path(config_path).await
} else {
Self::from_gguf(config_path).await
}
}
/// Creates a ModelDeploymentCard from a local directory path.
///
/// Currently HuggingFace format is supported and following files are expected:
/// - config.json: Model configuration in HuggingFace format
/// - tokenizer.json: Tokenizer configuration in HuggingFace format
/// - tokenizer_config.json: Optional prompt formatter configuration
///
/// # Arguments
/// * `local_root_dir` - Path to the local model directory
///
/// # Errors
/// Returns an error if:
/// - The path doesn't exist or isn't a directory
/// - The path contains invalid Unicode characters
/// - Required model files are missing or invalid
async fn from_local_path(local_root_dir: impl AsRef<Path>) -> anyhow::Result<Self> {
let local_root_dir = local_root_dir.as_ref();
check_valid_local_repo_path(local_root_dir)?;
let repo_id = local_root_dir
.canonicalize()?
.to_str()
.ok_or_else(|| anyhow::anyhow!("Path contains invalid Unicode"))?
.to_string();
let model_name = local_root_dir
.file_name()
.and_then(|n| n.to_str())
.ok_or_else(|| anyhow::anyhow!("Invalid model directory name"))?;
Self::from_repo(&repo_id, model_name).await
}
async fn from_gguf(gguf_file: &Path) -> anyhow::Result<Self> {
let model_name = gguf_file
.iter()
.next_back()
.map(|n| n.to_string_lossy().to_string());
let Some(model_name) = model_name else {
// I think this would only happy on an empty path
anyhow::bail!(
"Could not extract model name from path '{}'",
gguf_file.display()
);
};
// TODO: we do this in HFConfig also, unify
let content = super::model::load_gguf(gguf_file)?;
let context_length = content.get_metadata()[&format!("{}.context_length", content.arch())]
.to_u32()
.unwrap_or(0);
tracing::debug!(context_length, "Loaded context length from GGUF");
Ok(Self {
display_name: model_name.to_string(),
service_name: model_name.to_string(),
model_info: Some(ModelInfoType::GGUF(gguf_file.to_path_buf())),
tokenizer: Some(TokenizerKind::from_gguf(gguf_file)?),
gen_config: None, // AFAICT there is no equivalent in a GGUF
prompt_formatter: Some(PromptFormatterArtifact::GGUF(gguf_file.to_path_buf())),
chat_template_file: None,
prompt_context: None, // TODO - auto-detect prompt context
revision: 0,
last_published: None,
context_length,
kv_cache_block_size: 0,
migration_limit: 0,
user_data: None,
})
}
#[allow(dead_code)]
async fn from_ngc_repo(_: &str) -> anyhow::Result<Self> {
Err(anyhow::anyhow!(
"ModelDeploymentCard::from_ngc_repo is not implemented"
))
}
async fn from_repo(repo_id: &str, model_name: &str) -> anyhow::Result<Self> {
// This is usually the right choice
let context_length = crate::file_json_field(
&PathBuf::from(repo_id).join("config.json"),
"max_position_embeddings",
)
// But sometimes this is
.or_else(|_| {
crate::file_json_field(
&PathBuf::from(repo_id).join("tokenizer_config.json"),
"model_max_length",
)
})
// If neither of those are present let the engine default it
.unwrap_or(0);
Ok(Self {
display_name: model_name.to_string(),
service_name: model_name.to_string(),
model_info: Some(ModelInfoType::from_repo(repo_id).await?),
tokenizer: Some(TokenizerKind::from_repo(repo_id).await?),
gen_config: GenerationConfig::from_repo(repo_id).await.ok(), // optional
prompt_formatter: PromptFormatterArtifact::from_repo(repo_id).await?,
chat_template_file: PromptFormatterArtifact::chat_template_from_repo(repo_id).await?,
prompt_context: None, // TODO - auto-detect prompt context
revision: 0,
last_published: None,
context_length,
kv_cache_block_size: 0, // set later
migration_limit: 0,
user_data: None,
})
}
}
impl ModelInfoType {
pub async fn from_repo(repo_id: &str) -> Result<Self> {
Self::try_is_hf_repo(repo_id)
.await
.with_context(|| format!("unable to extract model info from repo {}", repo_id))
}
async fn try_is_hf_repo(repo: &str) -> anyhow::Result<Self> {
Ok(Self::HfConfigJson(
check_for_file(repo, "config.json").await?,
))
}
}
impl PromptFormatterArtifact {
pub async fn from_repo(repo_id: &str) -> Result<Option<Self>> {
// we should only error if we expect a prompt formatter and it's not found
// right now, we don't know when to expect it, so we just return Ok(Some/None)
Ok(Self::try_is_hf_repo(repo_id)
.await
.with_context(|| format!("unable to extract prompt format from repo {}", repo_id))
.ok())
}
pub async fn chat_template_from_repo(repo_id: &str) -> Result<Option<Self>> {
Ok(Self::chat_template_try_is_hf_repo(repo_id)
.await
.with_context(|| format!("unable to extract prompt format from repo {}", repo_id))
.ok())
}
async fn chat_template_try_is_hf_repo(repo: &str) -> anyhow::Result<Self> {
Ok(Self::HfChatTemplate(
check_for_file(repo, "chat_template.jinja").await?,
))
}
async fn try_is_hf_repo(repo: &str) -> anyhow::Result<Self> {
Ok(Self::HfTokenizerConfigJson(
check_for_file(repo, "tokenizer_config.json").await?,
))
}
}
impl TokenizerKind {
pub async fn from_repo(repo_id: &str) -> Result<Self> {
Self::try_is_hf_repo(repo_id)
.await
.with_context(|| format!("unable to extract tokenizer kind from repo {}", repo_id))
}
async fn try_is_hf_repo(repo: &str) -> anyhow::Result<Self> {
Ok(Self::HfTokenizerJson(
check_for_file(repo, "tokenizer.json").await?,
))
}
}
impl GenerationConfig {
pub async fn from_repo(repo_id: &str) -> Result<Self> {
Self::try_is_hf_repo(repo_id)
.await
.with_context(|| format!("unable to extract generation config from repo {repo_id}"))
}
async fn try_is_hf_repo(repo: &str) -> anyhow::Result<Self> {
Ok(Self::HfGenerationConfigJson(
check_for_file(repo, "generation_config.json").await?,
))
}
}
/// Checks if the provided path contains the expected file.
async fn check_for_file(repo_id: &str, file: &str) -> anyhow::Result<String> {
let p = PathBuf::from(repo_id).join(file);
let name = p.display().to_string();
if !p.exists() {
anyhow::bail!("File not found: {name}")
}
Ok(name)
}
/// Checks if the provided path is a valid local repository path.
///
/// # Arguments
/// * `path` - Path to validate
///
/// # Errors
/// Returns an error if the path doesn't exist or isn't a directory
fn check_valid_local_repo_path(path: impl AsRef<Path>) -> Result<()> {
let path = path.as_ref();
if !path.exists() {
return Err(anyhow::anyhow!(
"Model path does not exist: {}",
path.display()
));
}
if !path.is_dir() {
return Err(anyhow::anyhow!(
"Model path is not a directory: {}",
path.display()
));
}
Ok(())
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! # Model Deployment Card
//!
//! The ModelDeploymentCard (MDC) is the primary model configuration structure that will be available to any
//! component that needs to interact with the model or its dependent artifacts.
//!
//! The ModelDeploymentCard contains LLM model deployment configuration information:
//! - Display name and service name for the model
//! - Model information (ModelInfoType)
//! - Tokenizer configuration (TokenizerKind)
//! - Prompt formatter settings (PromptFormatterArtifact)
//! - Various metadata like revision, publish time, etc.
use std::fmt;
use std::fs::File;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Duration;
use anyhow::{Context, Result};
use derive_builder::Builder;
use dynamo_runtime::{slug::Slug, storage::key_value_store::Versioned, transports::nats};
use serde::{Deserialize, Serialize};
use tokenizers::Tokenizer as HfTokenizer;
use url::Url;
use crate::gguf::{Content, ContentConfig, ModelConfigLike};
use crate::protocols::TokenIdType;
/// If a model deployment card hasn't been refreshed in this much time the worker is likely gone
const CARD_MAX_AGE: chrono::TimeDelta = chrono::TimeDelta::minutes(5);
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "snake_case")]
pub enum ModelInfoType {
HfConfigJson(String),
GGUF(PathBuf),
}
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "snake_case")]
pub enum TokenizerKind {
HfTokenizerJson(String),
GGUF(Box<HfTokenizer>),
}
/// Supported types of prompt formatters.
///
/// We need a way to associate the prompt formatter template definition with an associated
/// data model which is expected for rendering.
///
/// All current prompt formatters are Jinja2 templates which use the OpenAI ChatCompletionRequest
/// format. However, we currently do not have a discovery path to know if the model supports tool use
/// unless we inspect the template.
///
/// TODO(): Add an enum for the PromptFormatDataModel with at minimum arms for:
/// - OaiChat
/// - OaiChatToolUse
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "snake_case")]
pub enum PromptFormatterArtifact {
HfTokenizerConfigJson(String),
HfChatTemplate(String),
GGUF(PathBuf),
}
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Hash)]
#[serde(rename_all = "snake_case")]
pub enum PromptContextMixin {
/// Support OAI Chat Messages and Tools
OaiChat,
/// Enables templates with `{{datetime}}` to be rendered with the current date and time.
Llama3DateTime,
}
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "snake_case")]
pub enum GenerationConfig {
HfGenerationConfigJson(String),
GGUF(PathBuf),
}
#[derive(Serialize, Deserialize, Clone, Debug, Builder, Default)]
pub struct ModelDeploymentCard {
/// Human readable model name, e.g. "Meta Llama 3.1 8B Instruct"
pub display_name: String,
/// Identifier to expect in OpenAI compatible HTTP request, e.g. "meta-llama/Meta-Llama-3.1-8B-Instruct"
/// This will get slugified for use in NATS.
pub service_name: String,
/// Model information
pub model_info: Option<ModelInfoType>,
/// Tokenizer configuration
pub tokenizer: Option<TokenizerKind>,
/// Prompt Formatter configuration
#[serde(default, skip_serializing_if = "Option::is_none")]
pub prompt_formatter: Option<PromptFormatterArtifact>,
/// chat template may be stored as a separate file instead of in `prompt_formatter`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub chat_template_file: Option<PromptFormatterArtifact>,
/// Generation config - default sampling params
#[serde(default, skip_serializing_if = "Option::is_none")]
pub gen_config: Option<GenerationConfig>,
/// Prompt Formatter Config
#[serde(default, skip_serializing_if = "Option::is_none")]
pub prompt_context: Option<Vec<PromptContextMixin>>,
/// When this card was last advertised by a worker. None if not yet published.
pub last_published: Option<chrono::DateTime<chrono::Utc>>,
/// Incrementing count of how many times we published this card
#[serde(default, skip_serializing)]
pub revision: u64,
/// Max context (in number of tokens) this model can handle
pub context_length: u32,
/// Size of a KV cache block - vllm only currently
/// Passed to the engine and the KV router.
pub kv_cache_block_size: u32,
/// How many times a request can be migrated to another worker if the HTTP server lost
/// connection to the current worker.
pub migration_limit: u32,
/// User-defined metadata for custom worker behavior
#[serde(default, skip_serializing_if = "Option::is_none")]
pub user_data: Option<serde_json::Value>,
}
impl ModelDeploymentCard {
pub fn builder() -> ModelDeploymentCardBuilder {
ModelDeploymentCardBuilder::default()
}
/// Create a ModelDeploymentCard where only the name is filled in.
///
/// Single-process setups don't need an MDC to communicate model details, but it
/// simplifies the code to assume we always have one. This is how you get one in those
/// cases. A quasi-null object: <https://en.wikipedia.org/wiki/Null_object_pattern>
pub fn with_name_only(name: &str) -> ModelDeploymentCard {
ModelDeploymentCard {
display_name: name.to_string(),
service_name: Slug::slugify(name).to_string(),
..Default::default()
}
}
/// How often we should check if a model deployment card expired because it's workers are gone
pub fn expiry_check_period() -> Duration {
match CARD_MAX_AGE.to_std() {
Ok(duration) => duration / 3,
Err(_) => {
// Only happens if CARD_MAX_AGE is negative, which it isn't
unreachable!("Cannot run card expiry watcher, invalid CARD_MAX_AGE");
}
}
}
/// Load a model deployment card from a JSON file
pub fn load_from_json_file<P: AsRef<Path>>(file: P) -> std::io::Result<Self> {
Ok(serde_json::from_str(&std::fs::read_to_string(file)?)?)
}
/// Load a model deployment card from a JSON string
pub fn load_from_json_str(json: &str) -> Result<Self, anyhow::Error> {
Ok(serde_json::from_str(json)?)
}
//
// Methods
//
/// Save the model deployment card to a JSON file
pub fn save_to_json_file(&self, file: &str) -> Result<(), anyhow::Error> {
std::fs::write(file, self.to_json()?)?;
Ok(())
}
pub fn set_service_name(&mut self, service_name: &str) {
self.service_name = service_name.to_string();
}
pub fn slug(&self) -> Slug {
Slug::from_string(&self.display_name)
}
/// Serialize the model deployment card to a JSON string
pub fn to_json(&self) -> Result<String, anyhow::Error> {
Ok(serde_json::to_string(self)?)
}
pub fn mdcsum(&self) -> String {
let json = self.to_json().unwrap();
format!("{}", blake3::hash(json.as_bytes()))
}
/// Was this card last published a long time ago, suggesting the worker is gone?
pub fn is_expired(&self) -> bool {
if let Some(last_published) = self.last_published.as_ref() {
chrono::Utc::now() - last_published > CARD_MAX_AGE
} else {
false
}
}
/// Is this a full model card with tokenizer?
/// There are cases where we have a placeholder card (see `with_name_only`).
pub fn has_tokenizer(&self) -> bool {
self.tokenizer.is_some()
}
pub fn tokenizer_hf(&self) -> anyhow::Result<HfTokenizer> {
match &self.tokenizer {
Some(TokenizerKind::HfTokenizerJson(file)) => {
HfTokenizer::from_file(file).map_err(anyhow::Error::msg)
}
Some(TokenizerKind::GGUF(t)) => Ok(*t.clone()),
None => {
anyhow::bail!("Blank ModelDeploymentCard does not have a tokenizer");
}
}
}
pub fn is_gguf(&self) -> bool {
match &self.model_info {
Some(info) => info.is_gguf(),
None => false,
}
}
/// Move the files this MDC uses into the NATS object store.
/// Updates the URI's to point to NATS.
pub async fn move_to_nats(&mut self, nats_client: nats::Client) -> Result<()> {
let nats_addr = nats_client.addr();
let bucket_name = self.slug();
tracing::debug!(
nats_addr,
%bucket_name,
"Uploading model deployment card fields to NATS"
);
macro_rules! nats_upload {
($field:expr, $enum_variant:path, $filename:literal) => {
if let Some($enum_variant(src_file)) = $field.take() {
if !nats::is_nats_url(&src_file) {
let target = format!("nats://{nats_addr}/{bucket_name}/{}", $filename);
nats_client
.object_store_upload(
&std::path::PathBuf::from(&src_file),
url::Url::parse(&target)?,
)
.await?;
$field = Some($enum_variant(target));
}
}
};
}
nats_upload!(self.model_info, ModelInfoType::HfConfigJson, "config.json");
nats_upload!(
self.prompt_formatter,
PromptFormatterArtifact::HfTokenizerConfigJson,
"tokenizer_config.json"
);
nats_upload!(
self.chat_template_file,
PromptFormatterArtifact::HfChatTemplate,
"chat_template.jinja"
);
nats_upload!(
self.tokenizer,
TokenizerKind::HfTokenizerJson,
"tokenizer.json"
);
nats_upload!(
self.gen_config,
GenerationConfig::HfGenerationConfigJson,
"generation_config.json"
);
Ok(())
}
/// Move the files this MDC uses from the NATS object store to local disk.
/// Updates the URI's to point to the created files.
///
/// The returned TempDir must be kept alive, it cleans up on drop.
pub async fn move_from_nats(&mut self, nats_client: nats::Client) -> Result<tempfile::TempDir> {
let nats_addr = nats_client.addr();
let bucket_name = self.slug();
let target_dir = tempfile::TempDir::with_prefix(bucket_name.to_string())?;
tracing::debug!(
nats_addr,
%bucket_name,
target_dir = %target_dir.path().display(),
"Downloading model deployment card fields from NATS"
);
macro_rules! nats_download {
($field:expr, $enum_variant:path, $filename:literal) => {
if let Some($enum_variant(src_url)) = $field.take() {
if nats::is_nats_url(&src_url) {
let target = target_dir.path().join($filename);
nats_client
.object_store_download(Url::parse(&src_url)?, &target)
.await?;
$field = Some($enum_variant(target.display().to_string()));
}
}
};
}
nats_download!(self.model_info, ModelInfoType::HfConfigJson, "config.json");
nats_download!(
self.prompt_formatter,
PromptFormatterArtifact::HfTokenizerConfigJson,
"tokenizer_config.json"
);
nats_download!(
self.chat_template_file,
PromptFormatterArtifact::HfChatTemplate,
"chat_template.jinja"
);
nats_download!(
self.tokenizer,
TokenizerKind::HfTokenizerJson,
"tokenizer.json"
);
nats_download!(
self.gen_config,
GenerationConfig::HfGenerationConfigJson,
"generation_config.json"
);
Ok(target_dir)
}
/// Delete this card from the key-value store and it's URLs from the object store
pub async fn delete_from_nats(&mut self, nats_client: nats::Client) -> Result<()> {
let nats_addr = nats_client.addr();
let bucket_name = self.slug();
tracing::trace!(
nats_addr,
%bucket_name,
"Delete model deployment card from NATS"
);
nats_client
.object_store_delete_bucket(bucket_name.as_ref())
.await
}
}
impl Versioned for ModelDeploymentCard {
fn revision(&self) -> u64 {
self.revision
}
fn set_revision(&mut self, revision: u64) {
self.last_published = Some(chrono::Utc::now());
self.revision = revision;
}
}
impl fmt::Display for ModelDeploymentCard {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self.slug())
}
}
pub trait ModelInfo: Send + Sync {
/// Model type
fn model_type(&self) -> String;
/// Token ID for the beginning of sequence
fn bos_token_id(&self) -> TokenIdType;
/// Token ID for the end of sequence
fn eos_token_ids(&self) -> Vec<TokenIdType>;
/// Maximum position embeddings / max sequence length
/// TODO: This is only used in a single test, no other code. Remove?
fn max_position_embeddings(&self) -> Option<usize>;
/// Vocabulary size
/// TODO: This is only used in a single test, no other code. Remove?
fn vocab_size(&self) -> Option<usize>;
}
impl ModelInfoType {
pub async fn get_model_info(&self) -> Result<Arc<dyn ModelInfo>> {
match self {
Self::HfConfigJson(info) => HFConfig::from_json_file(info).await,
Self::GGUF(path) => HFConfig::from_gguf(path),
}
}
pub fn is_gguf(&self) -> bool {
matches!(self, Self::GGUF(_))
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
struct HFConfig {
/// denotes the mixin to the flattened data model which can be present
/// in the config.json file
architectures: Vec<String>,
/// general model type
model_type: String,
text_config: Option<HFTextConfig>,
// Sometimes it's inside HFTextConfig, sometimes it's here
eos_token_id: Option<serde_json::Value>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
struct HFTextConfig {
// It can take multiple attempts to load this, so Option
bos_token_id: Option<TokenIdType>,
// We set this once bos_token_id is loaded so we don't have to deal with Option
#[serde(default)]
final_bos_token_id: TokenIdType,
eos_token_id: Option<serde_json::Value>,
#[serde(default)]
final_eos_token_ids: Vec<TokenIdType>,
/// max sequence length
max_position_embeddings: Option<usize>,
/// number of layers in the model
num_hidden_layers: usize,
/// number of attention heads in the model
num_attention_heads: Option<usize>,
/// Vocabulary size
vocab_size: Option<usize>,
}
impl HFConfig {
async fn from_json_file(file: &str) -> Result<Arc<dyn ModelInfo>> {
let file_pathbuf = PathBuf::from(file);
let contents = std::fs::read_to_string(file)?;
let mut config: Self = serde_json::from_str(&contents)?;
if config.text_config.is_none() {
let text_config: HFTextConfig = serde_json::from_str(&contents)?;
config.text_config = Some(text_config);
}
// Sometimes bos_token_id is in generation_config.json not config.json
let Some(text_config) = config.text_config.as_mut() else {
anyhow::bail!(
"Missing text config fields (model_type, eos_token_ids, etc) in config.json"
);
};
if text_config.bos_token_id.is_none() {
let bos_token_id = crate::file_json_field::<TokenIdType>(
&Path::join(
file_pathbuf.parent().unwrap_or(&PathBuf::from("")),
"generation_config.json",
),
"bos_token_id",
)
.context(
"missing bos_token_id in generation_config.json and config.json, cannot load",
)?;
text_config.bos_token_id = Some(bos_token_id);
}
// Now that we have it for sure, set it in the non-Option field
let final_bos_token_id = text_config.bos_token_id.take().unwrap();
text_config.final_bos_token_id = final_bos_token_id;
// TODO: refactor this when we switch to per-architecture tokenization
let final_eos_token_ids: Vec<TokenIdType> = config
.eos_token_id
.as_ref()
.or(text_config.eos_token_id.as_ref())
.and_then(|v| {
if v.is_number() {
v.as_number()
.and_then(|n| n.as_u64())
.map(|n| vec![n as TokenIdType])
} else if v.is_array() {
let arr = v.as_array().unwrap(); // Safety: We just checked
Some(
arr.iter()
.filter_map(|inner_v| {
inner_v
.as_number()
.and_then(|n| n.as_u64())
.map(|n| n as TokenIdType)
})
.collect(),
)
} else {
tracing::error!(
?v,
file,
"eos_token_id is not a number or an array, cannot use"
);
None
}
})
.or_else(|| {
// Maybe it's in generation_config.json
crate::file_json_field(
&Path::join(
file_pathbuf.parent().unwrap_or(&PathBuf::from("")),
"generation_config.json",
),
"eos_token_id",
)
.inspect_err(
|err| tracing::warn!(%err, "Missing eos_token_id in generation_config.json"),
)
.ok()
})
.ok_or_else(|| {
anyhow::anyhow!(
"missing eos_token_id in config.json and generation_config.json, cannot load"
)
})?;
text_config.final_eos_token_ids = final_eos_token_ids;
Ok(Arc::new(config))
}
fn from_gguf(gguf_file: &Path) -> Result<Arc<dyn ModelInfo>> {
let content = load_gguf(gguf_file)?;
let model_config_metadata: ContentConfig = (&content).into();
let num_hidden_layers =
content.get_metadata()[&format!("{}.block_count", content.arch())].to_u32()? as usize;
let bos_token_id = content.get_metadata()["tokenizer.ggml.bos_token_id"].to_u32()?;
let eos_token_id = content.get_metadata()["tokenizer.ggml.eos_token_id"].to_u32()?;
// to_vec returns a Vec that's already there, so it's cheap
let vocab_size = content.get_metadata()["tokenizer.ggml.tokens"]
.to_vec()?
.len();
let arch = content.arch().to_string();
Ok(Arc::new(HFConfig {
architectures: vec![format!("{}ForCausalLM", capitalize(&arch))],
// "general.architecture"
model_type: arch,
text_config: Some(HFTextConfig {
bos_token_id: None,
final_bos_token_id: bos_token_id,
eos_token_id: None,
final_eos_token_ids: vec![eos_token_id],
// "llama.context_length"
max_position_embeddings: Some(model_config_metadata.max_seq_len()),
// "llama.block_count"
num_hidden_layers,
// "llama.attention.head_count"
num_attention_heads: Some(model_config_metadata.num_attn_heads()),
// "tokenizer.ggml.tokens".len()
vocab_size: Some(vocab_size),
}),
eos_token_id: None,
}))
}
}
impl ModelInfo for HFConfig {
fn model_type(&self) -> String {
self.model_type.clone()
}
fn bos_token_id(&self) -> TokenIdType {
self.text_config.as_ref().unwrap().final_bos_token_id
}
fn eos_token_ids(&self) -> Vec<TokenIdType> {
self.text_config
.as_ref()
.unwrap()
.final_eos_token_ids
.clone()
}
fn max_position_embeddings(&self) -> Option<usize> {
self.text_config.as_ref().unwrap().max_position_embeddings
}
fn vocab_size(&self) -> Option<usize> {
self.text_config.as_ref().unwrap().vocab_size
}
}
impl TokenizerKind {
pub fn from_gguf(gguf_file: &Path) -> anyhow::Result<Self> {
let content = load_gguf(gguf_file)?;
let out = crate::gguf::convert_gguf_to_hf_tokenizer(&content)
.with_context(|| gguf_file.display().to_string())?;
Ok(TokenizerKind::GGUF(Box::new(out.tokenizer)))
}
}
pub(crate) fn load_gguf(gguf_file: &Path) -> anyhow::Result<Content> {
let filename = gguf_file.display().to_string();
let mut f = File::open(gguf_file).with_context(|| filename.clone())?;
// vec because GGUF can be split into multiple files (shards)
let mut readers = vec![&mut f];
crate::gguf::Content::from_readers(&mut readers).with_context(|| filename.clone())
}
fn capitalize(s: &str) -> String {
s.chars()
.enumerate()
.map(|(i, c)| {
if i == 0 {
c.to_uppercase().to_string()
} else {
c.to_lowercase().to_string()
}
})
.collect()
}
#[cfg(test)]
mod tests {
use super::HFConfig;
use std::path::Path;
#[tokio::test]
pub async fn test_config_json_llama3() -> anyhow::Result<()> {
let config_file = Path::new(env!("CARGO_MANIFEST_DIR"))
.join("tests/data/sample-models/mock-llama-3.1-8b-instruct/config.json");
let config = HFConfig::from_json_file(&config_file.display().to_string()).await?;
assert_eq!(config.bos_token_id(), 128000);
Ok(())
}
#[tokio::test]
pub async fn test_config_json_llama4() -> anyhow::Result<()> {
let config_file = Path::new(env!("CARGO_MANIFEST_DIR"))
.join("tests/data/sample-models/Llama-4-Scout-17B-16E-Instruct/config.json");
let config = HFConfig::from_json_file(&config_file.display().to_string()).await?;
assert_eq!(config.bos_token_id(), 200000);
Ok(())
}
}
......@@ -22,7 +22,7 @@ use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
use std::{collections::HashMap, sync::Arc};
use tracing;
use crate::model_card::model::{ModelDeploymentCard, ModelInfo, TokenizerKind};
use crate::model_card::{ModelDeploymentCard, ModelInfo, TokenizerKind};
use crate::preprocessor::prompt::OAIChatLikeRequest;
use crate::tokenizers::Encoding;
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::{collections::HashSet, sync::Arc};
use anyhow::{Context, Ok, Result};
use minijinja::Environment;
use crate::model_card::model::{ModelDeploymentCard, PromptContextMixin, PromptFormatterArtifact};
use crate::model_card::{ModelDeploymentCard, PromptContextMixin, PromptFormatterArtifact};
mod context;
mod formatters;
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use dynamo_llm::backend::Backend;
use dynamo_llm::model_card::model::ModelDeploymentCard;
use dynamo_llm::model_card::ModelDeploymentCard;
#[tokio::test]
async fn test_sequence_factory() {
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use dynamo_llm::model_card::model::{ModelDeploymentCard, PromptFormatterArtifact, TokenizerKind};
use dynamo_llm::model_card::{ModelDeploymentCard, PromptFormatterArtifact, TokenizerKind};
use tempfile::tempdir;
const HF_PATH: &str = "tests/data/sample-models/TinyLlama_v1.1";
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use anyhow::{Ok, Result};
use dynamo_llm::model_card::model::{ModelDeploymentCard, PromptContextMixin};
use dynamo_llm::model_card::{ModelDeploymentCard, PromptContextMixin};
use dynamo_llm::preprocessor::prompt::PromptFormatter;
use dynamo_llm::protocols::openai::chat_completions::NvCreateChatCompletionRequest;
use serde::{Deserialize, Serialize};
......
......@@ -21,7 +21,7 @@ const REPLACEMENT_CHAR: char = '_';
/// URL and NATS friendly string.
/// Only a-z, 0-9, - and _.
#[derive(Serialize, Clone, Debug, Eq, PartialEq)]
#[derive(Serialize, Clone, Debug, Eq, PartialEq, Default)]
pub struct Slug(String);
impl Slug {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment