feat: Make part of discovery re-usable (#3073)

Signed-off-by: Graham King <grahamk@nvidia.com>

feat: Make part of discovery re-usable (#3073)
Signed-off-by: Graham King <grahamk@nvidia.com>
9060ce12 · Graham King · GitHub · 045b61dd · 9060ce12 · 9060ce12
Unverified Commit 9060ce12 authored Sep 17, 2025 by Graham King Committed by GitHub Sep 17, 2025
8 changed files
--- a/lib/bindings/python/rust/llm/model_card.rs
+++ b/lib/bindings/python/rust/llm/model_card.rs
@@ -17,7 +17,7 @@ impl ModelDeploymentCard {
    // Previously called "from_local_path"
    #[staticmethod]
    fn load(path: String, model_name: String) -> PyResult<ModelDeploymentCard> {
-        let mut card = RsModelDeploymentCard::load(&path, None).map_err(to_pyerr)?;
+        let mut card = RsModelDeploymentCard::load_from_disk(&path, None).map_err(to_pyerr)?;
        card.set_name(&model_name);
        Ok(ModelDeploymentCard { inner: card })
    }

--- a/lib/llm/src/discovery/model_entry.rs
+++ b/lib/llm/src/discovery/model_entry.rs
 // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0

-use std::sync::Arc;
-
-use dynamo_runtime::transports::etcd;
-use dynamo_runtime::{
-    protocols,
-    slug::Slug,
-    storage::key_value_store::{EtcdStorage, KeyValueStore, KeyValueStoreManager},
-};
+use dynamo_runtime::{protocols, slug::Slug};
 use serde::{Deserialize, Serialize};

 use crate::{
    local_model::runtime_config::ModelRuntimeConfig,
-    model_card::{self, ModelDeploymentCard},
    model_type::{ModelInput, ModelType},
 };

-/// [ModelEntry] contains the information to discover models from the etcd cluster.
+/// [ModelEntry] contains the information to discover models
 #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
 pub struct ModelEntry {
    /// Public name of the model
@@ -42,7 +34,7 @@ pub struct ModelEntry {
 }

 impl ModelEntry {
-    /// Slugified display name for use in etcd and NATS
+    /// Slugified display name for use in network storage, or URL-safe environments
    pub fn slug(&self) -> Slug {
        Slug::from_string(&self.name)
    }
@@ -50,29 +42,4 @@ impl ModelEntry {
    pub fn requires_preprocessing(&self) -> bool {
        matches!(self.model_input, ModelInput::Tokens)
    }
-
-    /// Fetch the ModelDeploymentCard from etcd.
-    /// This does not touch its fields so you may need to call move_from_nats on it.
-    pub async fn load_mdc(
-        &self,
-        etcd_client: &etcd::Client,
-    ) -> anyhow::Result<ModelDeploymentCard> {
-        let kvstore: Box<dyn KeyValueStore> = Box::new(EtcdStorage::new(etcd_client.clone()));
-        let card_store = Arc::new(KeyValueStoreManager::new(kvstore));
-        let card_key = self.slug();
-        match card_store
-            .load::<ModelDeploymentCard>(model_card::ROOT_PATH, &card_key)
-            .await
-        {
-            Ok(Some(mdc)) => Ok(mdc),
-            Ok(None) => {
-                anyhow::bail!("Missing ModelDeploymentCard in etcd under key {card_key}");
-            }
-            Err(err) => {
-                anyhow::bail!(
-                    "Error fetching ModelDeploymentCard from etcd under key {card_key}. {err}"
-                );
-            }
-        }
-    }
 }
--- a/lib/llm/src/discovery/watcher.rs
+++ b/lib/llm/src/discovery/watcher.rs
@@ -21,6 +21,7 @@ use crate::{
    backend::Backend,
    entrypoint,
    kv_router::KvRouterConfig,
+    model_card::ModelDeploymentCard,
    model_type::{ModelInput, ModelType},
    preprocessor::{OpenAIPreprocessor, PreprocessedEmbeddingRequest, prompt::PromptFormatter},
    protocols::{
@@ -260,19 +261,16 @@ impl ModelWatcher {
            .namespace(&endpoint_id.namespace)?
            .component(&endpoint_id.component)?;
        let client = component.endpoint(&endpoint_id.name).client().await?;
-
-        let Some(etcd_client) = self.drt.etcd_client() else {
-            // Should be impossible because we only get here on an etcd event
-            anyhow::bail!("Missing etcd_client");
-        };
-        let card = match model_entry.load_mdc(&etcd_client).await {
-            Ok(card) => {
-                tracing::debug!(card.display_name, "adding model");
-                Some(card)
+        let model_slug = model_entry.slug();
+        let card = match ModelDeploymentCard::load_from_store(&model_slug, &self.drt).await {
+            Ok(Some(card)) => card,
+            Ok(None) => {
+                anyhow::bail!("Missing ModelDeploymentCard in storage under key {model_slug}");
            }
            Err(err) => {
-                tracing::info!(error = %err, "load_mdc did not complete");
-                None
+                anyhow::bail!(
+                    "Error fetching ModelDeploymentCard from storage under key {model_slug}. {err}"
+                );
            }
        };

@@ -284,15 +282,6 @@ impl ModelWatcher {
            // A model that expects pre-processed requests meaning it's up to us whether we
            // handle Chat or Completions requests, so handle whatever the model supports.

-            let Some(mut card) = card else {
-                anyhow::bail!("Missing model deployment card");
-            };
-            // Download tokenizer.json etc to local disk
-            // This cache_dir is a tempfile::TempDir will be deleted on drop. I _think_
-            // OpenAIPreprocessor::new loads the files, so we can delete them after this
-            // function. Needs checking carefully, possibly we need to store it in state.
-            let _cache_dir = Some(card.move_from_nats(self.drt.nats_client()).await?);
-
            let kv_chooser = if self.router_mode == RouterMode::KV {
                Some(
                    self.manager
@@ -309,7 +298,7 @@ impl ModelWatcher {
            };

            // This is expensive, we are loading ~10MiB JSON, so only do it once
-            let tokenizer_hf = card.tokenizer_hf()?;
+            let tokenizer_hf = card.tokenizer_hf().context("tokenizer_hf")?;

            // Add chat engine only if the model supports chat
            if model_entry.model_type.supports_chat() {
@@ -324,9 +313,11 @@ impl ModelWatcher {
                    kv_chooser.clone(),
                    tokenizer_hf.clone(),
                )
-                .await?;
+                .await
+                .context("build_routed_pipeline")?;
                self.manager
-                    .add_chat_completions_model(&model_entry.name, chat_engine)?;
+                    .add_chat_completions_model(&model_entry.name, chat_engine)
+                    .context("add_chat_completions_model")?;
                tracing::info!("Chat completions is ready");
            }

@@ -338,7 +329,8 @@ impl ModelWatcher {
                    card.clone(),
                    formatter,
                    tokenizer_hf.clone(),
-                )?;
+                )
+                .context("OpenAIPreprocessor::new_with_parts")?;
                let completions_engine = entrypoint::build_routed_pipeline_with_preprocessor::<
                    NvCreateCompletionRequest,
                    NvCreateCompletionResponse,
@@ -351,9 +343,11 @@ impl ModelWatcher {
                    preprocessor,
                    tokenizer_hf,
                )
-                .await?;
+                .await
+                .context("build_routed_pipeline_with_preprocessor")?;
                self.manager
-                    .add_completions_model(&model_entry.name, completions_engine)?;
+                    .add_completions_model(&model_entry.name, completions_engine)
+                    .context("add_completions_model")?;
                tracing::info!("Completions is ready");
            }
        } else if model_entry.model_input == ModelInput::Text
@@ -388,12 +382,6 @@ impl ModelWatcher {
            && model_entry.model_type.supports_embedding()
        {
            // Case 4: Tokens + Embeddings
-            let Some(mut card) = card else {
-                anyhow::bail!("Missing model deployment card for embedding model");
-            };
-
-            // Download tokenizer files to local disk
-            let _cache_dir = Some(card.move_from_nats(self.drt.nats_client()).await?);

            // Create preprocessing pipeline similar to Backend
            let frontend = SegmentSource::<

--- a/lib/llm/src/local_model.rs
+++ b/lib/llm/src/local_model.rs
@@ -252,8 +252,10 @@ impl LocalModelBuilder {
        // --model-config takes precedence over --model-path
        let model_config_path = self.model_config.as_ref().unwrap_or(&full_path);

-        let mut card =
-            ModelDeploymentCard::load(model_config_path, self.custom_template_path.as_deref())?;
+        let mut card = ModelDeploymentCard::load_from_disk(
+            model_config_path,
+            self.custom_template_path.as_deref(),
+        )?;

        // Usually we infer from the path, self.model_name is user override
        let model_name = self.model_name.take().unwrap_or_else(|| {

--- a/lib/llm/src/model_card.rs
+++ b/lib/llm/src/model_card.rs
@@ -23,6 +23,8 @@ use crate::common::checked_file::CheckedFile;
 use crate::local_model::runtime_config::ModelRuntimeConfig;
 use anyhow::{Context, Result};
 use derive_builder::Builder;
+use dynamo_runtime::DistributedRuntime;
+use dynamo_runtime::storage::key_value_store::{EtcdStorage, KeyValueStore, KeyValueStoreManager};
 use dynamo_runtime::{slug::Slug, storage::key_value_store::Versioned, transports::nats};
 use serde::{Deserialize, Serialize};
 use tokenizers::Tokenizer as HfTokenizer;
@@ -141,6 +143,9 @@ pub struct ModelDeploymentCard {

    #[serde(default)]
    pub runtime_config: ModelRuntimeConfig,
+
+    #[serde(skip)]
+    cache_dir: Option<Arc<tempfile::TempDir>>,
 }

 impl ModelDeploymentCard {
@@ -228,9 +233,9 @@ impl ModelDeploymentCard {
    pub fn tokenizer_hf(&self) -> anyhow::Result<HfTokenizer> {
        match &self.tokenizer {
            Some(TokenizerKind::HfTokenizerJson(checked_file)) => {
-                let p = checked_file.path().ok_or_else(||
-                    anyhow::anyhow!("Tokenizer is URL-backed ({:?}); call move_from_nats() before tokenizer_hf()", checked_file.url())
-                )?;
+                let p = checked_file.path().ok_or_else(|| {
+                    anyhow::anyhow!("Tokenizer is URL-backed ({:?})", checked_file.url())
+                })?;
                HfTokenizer::from_file(p)
                    .inspect_err(|err| {
                        if let Some(serde_err) = err.downcast_ref::<serde_json::Error>()
@@ -240,6 +245,7 @@ impl ModelDeploymentCard {
                        }
                    })
                    .map_err(anyhow::Error::msg)
+                    .with_context(|| p.display().to_string())
            }
            Some(TokenizerKind::GGUF(t)) => Ok(*t.clone()),
            None => {
@@ -308,7 +314,7 @@ impl ModelDeploymentCard {
    /// Updates the URI's to point to the created files.
    ///
    /// The returned TempDir must be kept alive, it cleans up on drop.
-    pub async fn move_from_nats(&mut self, nats_client: nats::Client) -> Result<tempfile::TempDir> {
+    async fn move_from_nats(&mut self, nats_client: nats::Client) -> Result<tempfile::TempDir> {
        let nats_addr = nats_client.addr();
        let bucket_name = self.slug();
        let target_dir = tempfile::TempDir::with_prefix(bucket_name.to_string())?;
@@ -388,7 +394,7 @@ impl ModelDeploymentCard {
    /// - a folder containing config.json, tokenizer.json and token_config.json
    /// - a GGUF file
    ///   With an optional custom template
-    pub fn load(
+    pub fn load_from_disk(
        config_path: impl AsRef<Path>,
        custom_template_path: Option<&Path>,
    ) -> anyhow::Result<ModelDeploymentCard> {
@@ -404,6 +410,29 @@ impl ModelDeploymentCard {
        }
    }

+    /// Load a ModelDeploymentCard from storage the DistributedRuntime is configured to use.
+    /// Card should be fully local and ready to use when the call returns.
+    pub async fn load_from_store(
+        model_slug: &Slug,
+        drt: &DistributedRuntime,
+    ) -> anyhow::Result<Option<Self>> {
+        let Some(etcd_client) = drt.etcd_client() else {
+            // Should be impossible because we only get here on an etcd event
+            anyhow::bail!("Missing etcd_client");
+        };
+        let store: Box<dyn KeyValueStore> = Box::new(EtcdStorage::new(etcd_client));
+        let card_store = Arc::new(KeyValueStoreManager::new(store));
+        let Some(mut card) = card_store
+            .load::<ModelDeploymentCard>(ROOT_PATH, model_slug)
+            .await?
+        else {
+            return Ok(None);
+        };
+        // This cache_dir is a tempfile::TempDir will be deleted on drop, so keep it alive.
+        card.cache_dir = Some(Arc::new(card.move_from_nats(drt.nats_client()).await?));
+        Ok(Some(card))
+    }
+
    /// Creates a ModelDeploymentCard from a local directory path.
    ///
    /// Currently HuggingFace format is supported and following files are expected:
@@ -474,6 +503,7 @@ impl ModelDeploymentCard {
            migration_limit: 0,
            user_data: None,
            runtime_config: ModelRuntimeConfig::default(),
+            cache_dir: None,
        })
    }

@@ -537,6 +567,7 @@ impl ModelDeploymentCard {
            migration_limit: 0,
            user_data: None,
            runtime_config: ModelRuntimeConfig::default(),
+            cache_dir: None,
        })
    }
 }

--- a/lib/llm/tests/backend.rs
+++ b/lib/llm/tests/backend.rs
@@ -6,7 +6,8 @@ use dynamo_llm::model_card::ModelDeploymentCard;

 #[test]
 fn test_sequence_factory() {
-    let mdc = ModelDeploymentCard::load("tests/data/sample-models/TinyLlama_v1.1", None).unwrap();
+    let mdc = ModelDeploymentCard::load_from_disk("tests/data/sample-models/TinyLlama_v1.1", None)
+        .unwrap();

    let operator = Backend::from_mdc(&mdc);


--- a/lib/llm/tests/model_card.rs
+++ b/lib/llm/tests/model_card.rs
@@ -8,7 +8,7 @@ const HF_PATH: &str = "tests/data/sample-models/TinyLlama_v1.1";

 #[tokio::test]
 async fn test_model_info_from_hf_like_local_repo() {
-    let mdc = ModelDeploymentCard::load(HF_PATH, None).unwrap();
+    let mdc = ModelDeploymentCard::load_from_disk(HF_PATH, None).unwrap();
    let info = mdc.model_info.unwrap().get_model_info().unwrap();
    assert_eq!(info.model_type(), "llama");
    assert_eq!(info.bos_token_id(), 1);
@@ -20,13 +20,13 @@ async fn test_model_info_from_hf_like_local_repo() {
 #[tokio::test]
 async fn test_model_info_from_non_existent_local_repo() {
    let path = "tests/data/sample-models/this-model-does-not-exist";
-    let result = ModelDeploymentCard::load(path, None);
+    let result = ModelDeploymentCard::load_from_disk(path, None);
    assert!(result.is_err());
 }

 #[tokio::test]
 async fn test_tokenizer_from_hf_like_local_repo() {
-    let mdc = ModelDeploymentCard::load(HF_PATH, None).unwrap();
+    let mdc = ModelDeploymentCard::load_from_disk(HF_PATH, None).unwrap();
    // Verify tokenizer file was found
    match mdc.tokenizer.unwrap() {
        TokenizerKind::HfTokenizerJson(_) => (),
@@ -36,7 +36,7 @@ async fn test_tokenizer_from_hf_like_local_repo() {

 #[tokio::test]
 async fn test_prompt_formatter_from_hf_like_local_repo() {
-    let mdc = ModelDeploymentCard::load(HF_PATH, None).unwrap();
+    let mdc = ModelDeploymentCard::load_from_disk(HF_PATH, None).unwrap();
    // Verify prompt formatter was found
    match mdc.prompt_formatter {
        Some(PromptFormatterArtifact::HfTokenizerConfigJson(_)) => (),
@@ -48,7 +48,7 @@ async fn test_prompt_formatter_from_hf_like_local_repo() {
 async fn test_missing_required_files() {
    // Create empty temp directory
    let temp_dir = tempdir().unwrap();
-    let result = ModelDeploymentCard::load(temp_dir.path(), None);
+    let result = ModelDeploymentCard::load_from_disk(temp_dir.path(), None);
    assert!(result.is_err());
    let err = result.unwrap_err().to_string();
    // Should fail because config.json is missing

--- a/lib/llm/tests/preprocessor.rs
+++ b/lib/llm/tests/preprocessor.rs
@@ -57,7 +57,7 @@ async fn make_mdc_from_repo(
    //TODO: remove this once we have nim-hub support. See the NOTE above.
    let downloaded_path = maybe_download_model(local_path, hf_repo, hf_revision).await;
    let display_name = format!("{}--{}", hf_repo, hf_revision);
-    let mut mdc = ModelDeploymentCard::load(downloaded_path, None).unwrap();
+    let mut mdc = ModelDeploymentCard::load_from_disk(downloaded_path, None).unwrap();
    mdc.set_name(&display_name);
    mdc.prompt_context = mixins;
    mdc