Unverified Commit d346782c authored by Graham King, committed by GitHub

chore: Publish Model Deployment Card to NATS (#799)

This will allow an ingress-side pre-processor to see it without needing a model checkout.

Currently pre-processing is done in the worker, which has access to the model deployment card ("MDC") files (`config.json`, `tokenizer.json` and `tokenizer_config.json`) locally. We want to move the pre-processor to the ingress side to support KV routing. That requires the ingress side (i.e. the HTTP server), which may be on a different machine than the worker, to be able to see those three files.

To support that, this PR makes the worker upload the contents of those files to the NATS object store, and publish the MDC with those NATS URLs to the key-value store.

The key-value store is behind an interface so any store (NATS, etcd, Redis, etc.) can be supported. Implementations for memory and NATS are provided.

Fetching the MDC from the store, doing pre-processing on the ingress side, and publishing a card backed by a GGUF are all left for a later commit.
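For reference, an ingress-side consumer could read the published card roughly like this (a hedged sketch against the `KeyValueStoreManager` API added in this PR; the module path for `ModelDeploymentCard` and the exact wiring are assumptions until the follow-up lands):

```rust
// Sketch only, not part of this PR: fetch the Model Deployment Card a worker
// published, from the ingress side. Imports/paths are assumptions.
use dynamo_llm::key_value_store::KeyValueStoreManager;
use dynamo_llm::model_card::{model::ModelDeploymentCard, BUCKET_NAME};

async fn fetch_card(
    store: &KeyValueStoreManager,
    service_name: &str,
) -> anyhow::Result<Option<ModelDeploymentCard>> {
    // Cards are keyed by the slugified service name.
    let key = ModelDeploymentCard::service_name_slug(service_name);
    // `load` returns Ok(None) if no worker has published a card yet.
    Ok(store.load(BUCKET_NAME, &key).await?)
}
```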

Part of #743 
parent 16310b26
......@@ -1600,6 +1600,7 @@ version = "0.1.1"
dependencies = [
"akin",
"anyhow",
"async-nats",
"async-openai",
"async-stream",
"async-trait",
......@@ -1649,6 +1650,7 @@ dependencies = [
"toktrie_hf_tokenizers 0.6.31",
"tracing",
"unicode-segmentation",
"url",
"uuid 1.16.0",
"validator",
"xxhash-rust",
......@@ -1729,6 +1731,7 @@ dependencies = [
"tokio-util",
"tracing",
"tracing-subscriber",
"url",
"uuid 1.16.0",
"validator",
"xxhash-rust",
......@@ -6915,6 +6918,7 @@ dependencies = [
"form_urlencoded",
"idna",
"percent-encoding",
"serde",
]
[[package]]
......
......@@ -60,6 +60,7 @@ prometheus = { version = "0.14" }
rand = { version = "0.9.0" }
serde = { version = "1", features = ["derive"] }
serde_json = { version = "1" }
strum = { version = "0.27", features = ["derive"] }
thiserror = { version = "2.0.11" }
tokio = { version = "1", features = ["full"] }
tokio-stream = { version = "0.1" }
......@@ -68,8 +69,8 @@ tracing = { version = "0.1" }
tracing-subscriber = { version = "0.3", features = ["env-filter", "local-time", "json"] }
validator = { version = "0.20.0", features = ["derive"] }
uuid = { version = "1", features = ["v4", "serde"] }
url = { version = "2.5", features = ["serde"] }
xxhash-rust = { version = "0.8", features = ["xxh3", "const_xxh3"] }
strum = { version = "0.27", features = ["derive"] }
[profile.dev.package]
insta.opt-level = 3
......
......@@ -66,6 +66,7 @@ pub async fn prepare_engine(
EngineConfig::StaticFull {
service_name,
engine,
card: _card,
} => {
tracing::debug!("Model: {service_name}");
Ok((service_name, engine, false))
......
......@@ -13,9 +13,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use dynamo_llm::{
backend::Backend,
http::service::discovery::ModelEntry,
key_value_store::{KeyValueStore, KeyValueStoreManager, NATSStorage},
model_card::{BUCKET_NAME, BUCKET_TTL},
model_type::ModelType,
preprocessor::OpenAIPreprocessor,
types::{
......@@ -44,11 +48,12 @@ pub async fn run(
let etcd_client = distributed_runtime.etcd_client();
let (ingress, service_name) = match engine_config {
let (ingress, service_name, mut card) = match engine_config {
EngineConfig::StaticFull {
service_name,
engine,
} => (Ingress::for_engine(engine)?, service_name),
card,
} => (Ingress::for_engine(engine)?, service_name, card),
EngineConfig::StaticCore {
service_name,
engine: inner_engine,
......@@ -72,7 +77,7 @@ pub async fn run(
.link(preprocessor.backward_edge())?
.link(frontend)?;
(Ingress::for_pipeline(pipeline)?, service_name)
(Ingress::for_pipeline(pipeline)?, service_name, card)
}
EngineConfig::Dynamic(_) => {
anyhow::bail!("Cannot use endpoint for both in and out");
......@@ -87,13 +92,29 @@ pub async fn run(
};
let component = distributed_runtime
.namespace(endpoint_id.namespace)?
.component(endpoint_id.component)?;
.namespace(&endpoint_id.namespace)?
.component(&endpoint_id.component)?;
let endpoint = component
.service_builder()
.create()
.await?
.endpoint(endpoint_id.name);
.endpoint(&endpoint_id.name);
let nats_client = distributed_runtime.nats_client();
card.move_to_nats(nats_client.clone()).await?;
let kvstore: Box<dyn KeyValueStore> =
Box::new(NATSStorage::new(nats_client.clone(), endpoint_id));
let card_store = Arc::new(KeyValueStoreManager::new(kvstore));
card.requires_preprocessing = false;
card_store.publish_until_cancelled(
cancel_token.clone(),
BUCKET_NAME.to_string(),
Some(BUCKET_TTL),
BUCKET_TTL / 2,
card.slug().to_string(),
*card.clone(),
);
if let Some(etcd_client) = etcd_client {
let network_name = endpoint.subject_to(etcd_client.lease_id());
......@@ -115,5 +136,9 @@ pub async fn run(
_ = cancel_token.cancelled() => {
}
}
// Cleanup on shutdown
if let Err(err) = card.delete_from_nats(nats_client).await {
tracing::error!(%err, "delete_from_nats error on shutdown");
}
Ok(())
}
......@@ -61,6 +61,7 @@ pub enum EngineConfig {
StaticFull {
service_name: String,
engine: OpenAIChatCompletionsStreamingEngine,
card: Box<ModelDeploymentCard>,
},
/// A core engine expects to be wrapped with pre/post processors that handle tokenization.
......@@ -173,7 +174,9 @@ pub async fn run(
}
// Otherwise we don't have one, but we only need it if we're tokenizing
_ => {
tracing::debug!("No model card path provided (neither --model-config nor a directory in --model-path)");
tracing::debug!(
"No model card path provided (neither --model-config nor --model-path)"
);
None
}
};
......@@ -203,6 +206,7 @@ pub async fn run(
);
};
EngineConfig::StaticFull {
card: Box::new(ModelDeploymentCard::with_name_only(&model_name)),
service_name: model_name,
engine: dynamo_llm::engines::make_engine_full(),
}
......@@ -233,6 +237,7 @@ pub async fn run(
unreachable!("We checked model_path earlier, and set model_name from model_path");
};
EngineConfig::StaticFull {
card: Box::new(ModelDeploymentCard::with_name_only(&model_name)),
service_name: model_name,
engine: dynamo_engine_mistralrs::make_engine(&model_path).await?,
}
......@@ -422,16 +427,17 @@ pub async fn run(
}
#[cfg(feature = "python")]
Output::PythonStr(path_str) => {
let Some(model_name) = model_name else {
let Some(model_name) = &model_name else {
anyhow::bail!("Provide model service name as `--model-name <this>`");
};
let py_args = flags.as_vec(&path_str, &model_name);
let py_args = flags.as_vec(&path_str, model_name);
let p = std::path::PathBuf::from(path_str);
let engine =
dynamo_engine_python::make_string_engine(cancel_token.clone(), &p, py_args).await?;
EngineConfig::StaticFull {
service_name: model_name,
service_name: model_name.to_string(),
engine,
card: Box::new(ModelDeploymentCard::with_name_only(model_name)),
}
}
#[cfg(feature = "python")]
......
......@@ -1032,6 +1032,7 @@ version = "0.1.1"
dependencies = [
"akin",
"anyhow",
"async-nats",
"async-openai",
"async-stream",
"async-trait",
......@@ -1071,6 +1072,7 @@ dependencies = [
"toktrie_hf_tokenizers",
"tracing",
"unicode-segmentation",
"url",
"uuid",
"validator",
"xxhash-rust",
......@@ -1136,6 +1138,7 @@ dependencies = [
"tokio-util",
"tracing",
"tracing-subscriber",
"url",
"uuid",
"validator",
"xxhash-rust",
......@@ -4803,6 +4806,7 @@ dependencies = [
"form_urlencoded",
"idna",
"percent-encoding",
"serde",
]
[[package]]
......
......@@ -38,6 +38,7 @@ dynamo-runtime = { workspace = true }
anyhow = { workspace = true }
async-stream = { workspace = true }
async-trait = { workspace = true }
async-nats = { workspace = true }
async_zmq = { workspace = true }
bytes = { workspace = true }
chrono = { workspace = true }
......@@ -54,6 +55,7 @@ tokio-stream = { workspace = true }
tokio-util = { workspace = true }
tracing = { workspace = true }
validator = { workspace = true }
url = { workspace = true }
uuid = { workspace = true }
xxhash-rust = { workspace = true }
strum = { workspace = true }
......
......@@ -61,7 +61,7 @@ pub type ExecutionContext = ServerStreamingEngine<BackendInput, ExecutionOutputS
/// Backend handles resource management and orchestrates LLM execution
#[allow(dead_code)]
pub struct Backend {
pub tokenizer: Tokenizer, // Handles token encoding/decoding
pub tokenizer: Option<Tokenizer>, // Handles token encoding/decoding
validate_engine_decode: bool, // Enable validation of engine decoding
}
......@@ -79,17 +79,23 @@ impl Backend {
let tokenizer = Tokenizer::from(Arc::new(tokenizer));
Ok(Arc::new(Self {
tokenizer,
tokenizer: Some(tokenizer),
validate_engine_decode: false,
}))
}
pub async fn from_mdc(mdc: ModelDeploymentCard) -> Result<Arc<Self>> {
let tokenizer = match &mdc.tokenizer {
TokenizerKind::HfTokenizerJson(file) => {
Some(TokenizerKind::HfTokenizerJson(file)) => {
HfTokenizer::from_file(file).map_err(Error::msg)?
}
TokenizerKind::GGUF(t) => *t.clone(),
Some(TokenizerKind::GGUF(t)) => *t.clone(),
None => {
return Ok(Arc::new(Self {
tokenizer: None,
validate_engine_decode: false,
}));
}
};
Self::from_tokenizer(tokenizer).await
}
......@@ -98,14 +104,17 @@ impl Backend {
&self,
stream: ManyOut<ExecutionOutputStream>,
stop_conditions: StopConditions,
) -> DecoderUnfoldState {
let decoder = Decoder::new(self.tokenizer.decode_stream(false), stop_conditions);
) -> anyhow::Result<DecoderUnfoldState> {
let Some(tokenizer) = self.tokenizer.as_ref() else {
anyhow::bail!("Backend built from blank ModelDeploymentCard, no tokenizer");
};
let decoder = Decoder::new(tokenizer.decode_stream(false), stop_conditions);
DecoderUnfoldState {
Ok(DecoderUnfoldState {
stream,
decoder,
validate_engine_decode: self.validate_engine_decode,
}
})
}
}
......@@ -127,7 +136,7 @@ impl
let next_stream = next.generate(request).await?;
let context = next_stream.context();
let state = self.decoder(next_stream, stop_conditions);
let state = self.decoder(next_stream, stop_conditions)?;
let processed_stream = stream::unfold(state, |mut state| async move {
match state.stream.next().await {
......
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Interface to a traditional key-value store such as etcd.
//! "key_value_store" spelt out because in AI land "KV" means something else.
use std::collections::HashMap;
use std::fmt;
use std::pin::Pin;
use std::sync::Arc;
use std::time::Duration;
use async_trait::async_trait;
use dynamo_runtime::slug::Slug;
use dynamo_runtime::CancellationToken;
use futures::StreamExt;
use serde::{Deserialize, Serialize};
mod mem;
pub use mem::MemoryStorage;
mod nats;
pub use nats::NATSStorage;
#[async_trait]
pub trait KeyValueStore: Send + Sync {
async fn get_or_create_bucket(
&self,
bucket_name: &str,
// auto-delete items older than this
ttl: Option<Duration>,
) -> Result<Box<dyn KeyValueBucket>, StorageError>;
async fn get_bucket(
&self,
bucket_name: &str,
) -> Result<Option<Box<dyn KeyValueBucket>>, StorageError>;
}
pub struct KeyValueStoreManager(Box<dyn KeyValueStore>);
impl KeyValueStoreManager {
pub fn new(s: Box<dyn KeyValueStore>) -> KeyValueStoreManager {
KeyValueStoreManager(s)
}
pub async fn load<T: for<'a> Deserialize<'a>>(
&self,
bucket: &str,
key: &Slug,
) -> Result<Option<T>, StorageError> {
let Some(bucket) = self.0.get_bucket(bucket).await? else {
// No bucket means no cards
return Ok(None);
};
match bucket.get(key.as_ref()).await {
Ok(Some(card_bytes)) => {
let card: T = serde_json::from_slice(card_bytes.as_ref())?;
Ok(Some(card))
}
Ok(None) => Ok(None),
Err(err) => {
// TODO look at what errors NATS can give us and make more specific wrappers
Err(StorageError::NATSError(err.to_string()))
}
}
}
/// Returns a receiver that will receive all the existing values, and
/// then block and receive new values as they are created.
/// Starts a task that runs forever, watching the store.
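/// A sketch of intended usage (names are illustrative; `card_store` is an
/// `Arc<KeyValueStoreManager>`):
///
/// ```ignore
/// let (_watch_task, mut rx) = card_store.clone().watch::<ModelDeploymentCard>("mdc", None);
/// while let Some(card) = rx.recv().await {
///     tracing::info!(card = %card.slug(), "model deployment card published");
/// }
/// ```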
pub fn watch<T: for<'a> Deserialize<'a> + Send + 'static>(
self: Arc<Self>,
bucket_name: &str,
bucket_ttl: Option<Duration>,
) -> (
tokio::task::JoinHandle<Result<(), StorageError>>,
tokio::sync::mpsc::UnboundedReceiver<T>,
) {
let bucket_name = bucket_name.to_string();
let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
let watch_task = tokio::spawn(async move {
// Start listening for changes but don't poll this yet
let bucket = self
.0
.get_or_create_bucket(&bucket_name, bucket_ttl)
.await?;
let mut stream = bucket.watch().await?;
// Send all the existing entries first
for (_, card_bytes) in bucket.entries().await? {
let card: T = serde_json::from_slice(card_bytes.as_ref())?;
let _ = tx.send(card);
}
// Now block waiting for new entries
while let Some(card_bytes) = stream.next().await {
let card: T = serde_json::from_slice(card_bytes.as_ref())?;
let _ = tx.send(card);
}
Ok::<(), StorageError>(())
});
(watch_task, rx)
}
pub async fn publish<T: Serialize + Versioned + Send + Sync>(
&self,
bucket_name: &str,
bucket_ttl: Option<Duration>,
key: &str,
obj: &mut T,
) -> anyhow::Result<StorageOutcome> {
let obj_json = serde_json::to_string(obj)?;
let bucket = self.0.get_or_create_bucket(bucket_name, bucket_ttl).await?;
let outcome = bucket
.insert(key.to_string(), obj_json, obj.revision())
.await?;
match outcome {
StorageOutcome::Created(revision) | StorageOutcome::Exists(revision) => {
obj.set_revision(revision);
}
}
Ok(outcome)
}
/// Re-publish the model card to the store regularly. Spawns a task and returns.
/// Takes most arguments by value because it will hold on to them in the publish task.
/// Deletes the card on cancellation.
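/// A sketch of the call as made in this PR's `main.rs`:
///
/// ```ignore
/// card_store.publish_until_cancelled(
///     cancel_token.clone(),
///     BUCKET_NAME.to_string(),
///     Some(BUCKET_TTL),  // entries expire if the worker stops re-publishing
///     BUCKET_TTL / 2,    // re-publish at half the TTL so a live card never lapses
///     card.slug().to_string(),
///     *card.clone(),
/// );
/// ```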
pub fn publish_until_cancelled<T: Serialize + Versioned + Send + Sync + 'static>(
self: Arc<Self>,
cancel_token: CancellationToken,
bucket_name: String,
bucket_ttl: Option<Duration>,
publish_interval: Duration,
key: String,
mut obj: T,
) {
tokio::spawn(async move {
loop {
let publish_result = self
.clone()
.publish(&bucket_name, bucket_ttl, &key, &mut obj)
.await;
if let Err(err) = publish_result {
tracing::error!(
model = key,
error = %err,
"Failed publishing to KV storage. Ending publish task.",
);
break;
}
tokio::select! {
_ = tokio::time::sleep(publish_interval) => {},
_ = cancel_token.cancelled() => {
tracing::trace!(model_service_name = key, "Publish loop cancelled");
match self.0.get_bucket(&bucket_name).await {
Ok(Some(bucket)) => {
if let Err(err) = bucket.delete(&key).await {
// This is usually expected, our NATS connection is closed
tracing::trace!(bucket_name, key, %err, "Error deleting published card from NATS on publish stop");
}
tracing::trace!(bucket_name, key, "Deleted Model Deployment Card from NATS");
}
Ok(None) => {
tracing::trace!(bucket_name, key, "Bucket does not exist");
}
Err(err) => {
tracing::trace!(bucket_name, %err, "publish_until_cancelled shutdown error");
}
}
// Stop publishing
break;
}
}
}
});
}
}
/// An online storage for key-value config values.
/// Usually backed by `nats-server`.
#[async_trait]
pub trait KeyValueBucket: Send {
/// A bucket is a collection of key/value pairs.
/// Insert a value into the bucket. A revision of 0 creates a new entry;
/// a non-zero revision updates an existing entry at that revision.
async fn insert(
&self,
key: String,
value: String,
revision: u64,
) -> Result<StorageOutcome, StorageError>;
/// Fetch an item from the key-value storage
async fn get(&self, key: &str) -> Result<Option<bytes::Bytes>, StorageError>;
/// Delete an item from the bucket
async fn delete(&self, key: &str) -> Result<(), StorageError>;
/// A stream of items inserted into the bucket.
/// Every time the stream is polled it will either return a newly created entry,
/// or block until one is created.
async fn watch(
&self,
) -> Result<Pin<Box<dyn futures::Stream<Item = bytes::Bytes> + Send + 'life0>>, StorageError>;
async fn entries(&self) -> Result<HashMap<String, bytes::Bytes>, StorageError>;
}
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum StorageOutcome {
/// The operation succeeded and created a new entry with this revision.
/// Note that "create" also means update, because each new revision is a "create".
Created(u64),
/// The operation did not do anything, the value was already present, with this revision.
Exists(u64),
}
impl fmt::Display for StorageOutcome {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
StorageOutcome::Created(revision) => write!(f, "Created at {revision}"),
StorageOutcome::Exists(revision) => write!(f, "Exists at {revision}"),
}
}
}
#[derive(thiserror::Error, Debug)]
pub enum StorageError {
#[error("Could not find bucket '{0}'")]
MissingBucket(String),
#[error("Could not find key '{0}'")]
MissingKey(String),
#[error("Internal storage error: '{0}'")]
ProviderError(String),
#[error("Internal NATS error: {0}")]
NATSError(String),
#[error("Internal etcd error: {0}")]
EtcdError(String),
#[error("Key Value Error: {0} for bucket '{1}")]
KeyValueError(String, String),
#[error("Error decoding bytes: {0}")]
JSONDecodeError(#[from] serde_json::error::Error),
#[error("Race condition, retry the call")]
Retry,
}
/// A trait that allows getting and setting a revision on an object.
/// NATS uses this to ensure atomic updates.
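/// A minimal sketch of an implementation (`ModelDeploymentCard` implements this
/// the same way elsewhere in this PR):
///
/// ```ignore
/// struct Entry { revision: u64 }
/// impl Versioned for Entry {
///     fn revision(&self) -> u64 { self.revision }
///     fn set_revision(&mut self, r: u64) { self.revision = r; }
/// }
/// ```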
pub trait Versioned {
fn revision(&self) -> u64;
fn set_revision(&mut self, r: u64);
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use super::*;
use futures::{pin_mut, StreamExt};
const BUCKET_NAME: &str = "mdc";
/// Convert the value returned by `watch()` into a broadcast stream that multiple
/// clients can listen to.
#[allow(dead_code)]
pub struct TappableStream {
tx: tokio::sync::broadcast::Sender<bytes::Bytes>,
}
#[allow(dead_code)]
impl TappableStream {
async fn new<T>(stream: T, max_size: usize) -> Self
where
T: futures::Stream<Item = bytes::Bytes> + Send + 'static,
{
let (tx, _) = tokio::sync::broadcast::channel(max_size);
let tx2 = tx.clone();
tokio::spawn(async move {
pin_mut!(stream);
while let Some(x) = stream.next().await {
let _ = tx2.send(x);
}
});
TappableStream { tx }
}
fn subscribe(&self) -> tokio::sync::broadcast::Receiver<bytes::Bytes> {
self.tx.subscribe()
}
}
fn init() {
dynamo_runtime::logging::init();
}
#[tokio::test]
async fn test_memory_storage() -> anyhow::Result<()> {
init();
let s = Arc::new(MemoryStorage::new());
let s2 = Arc::clone(&s);
let bucket = s.get_or_create_bucket(BUCKET_NAME, None).await?;
let res = bucket
.insert("test1".to_string(), "value1".to_string(), 0)
.await?;
assert_eq!(res, StorageOutcome::Created(0));
let (got_first_tx, got_first_rx) = tokio::sync::oneshot::channel();
let ingress = tokio::spawn(async move {
let b2 = s2.get_or_create_bucket(BUCKET_NAME, None).await?;
let mut stream = b2.watch().await?;
// Put in before starting the watch-all
let v = stream.next().await.unwrap();
assert_eq!(v, "value1".as_bytes());
got_first_tx.send(()).unwrap();
// Put in after
let v = stream.next().await.unwrap();
assert_eq!(v, "value2".as_bytes());
let v = stream.next().await.unwrap();
assert_eq!(v, "value3".as_bytes());
Ok::<_, StorageError>(())
});
// MemoryStorage uses a HashMap with no inherent ordering, so we must ensure test1 is
// fetched before test2 is inserted, otherwise they can come out in any order, and we
// wouldn't be testing the watch behavior.
got_first_rx.await?;
let res = bucket
.insert("test2".to_string(), "value2".to_string(), 0)
.await?;
assert_eq!(res, StorageOutcome::Created(0));
// Repeat a key and revision. Ignored.
let res = bucket
.insert("test2".to_string(), "value2".to_string(), 0)
.await?;
assert_eq!(res, StorageOutcome::Exists(0));
// Increment revision
let res = bucket
.insert("test2".to_string(), "value2".to_string(), 1)
.await?;
assert_eq!(res, StorageOutcome::Created(1));
let res = bucket
.insert("test3".to_string(), "value3".to_string(), 0)
.await?;
assert_eq!(res, StorageOutcome::Created(0));
// ingress exits once it has received all values
let _ = ingress.await?;
Ok(())
}
#[tokio::test]
async fn test_broadcast_stream() -> anyhow::Result<()> {
init();
let s: &'static _ = Box::leak(Box::new(MemoryStorage::new()));
let bucket: &'static _ =
Box::leak(Box::new(s.get_or_create_bucket(BUCKET_NAME, None).await?));
let res = bucket
.insert("test1".to_string(), "value1".to_string(), 0)
.await?;
assert_eq!(res, StorageOutcome::Created(0));
let stream = bucket.watch().await?;
let tap = TappableStream::new(stream, 10).await;
let mut rx1 = tap.subscribe();
let mut rx2 = tap.subscribe();
let handle1 = tokio::spawn(async move {
let b = rx1.recv().await.unwrap();
assert_eq!(b, bytes::Bytes::from(vec![b'G', b'K']));
});
let handle2 = tokio::spawn(async move {
let b = rx2.recv().await.unwrap();
assert_eq!(b, bytes::Bytes::from(vec![b'G', b'K']));
});
bucket
.insert("test1".to_string(), "GK".to_string(), 1)
.await?;
let _ = futures::join!(handle1, handle2);
Ok(())
}
}
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::hash_map::Entry;
use std::collections::{HashMap, HashSet};
use std::pin::Pin;
use std::sync::Arc;
use std::time::Duration;
use async_trait::async_trait;
use tokio::sync::mpsc::{UnboundedReceiver, UnboundedSender};
use tokio::sync::Mutex;
use super::{KeyValueBucket, KeyValueStore, StorageError, StorageOutcome};
#[derive(Clone)]
pub struct MemoryStorage {
inner: Arc<MemoryStorageInner>,
}
impl Default for MemoryStorage {
fn default() -> Self {
Self::new()
}
}
struct MemoryStorageInner {
data: Mutex<HashMap<String, MemoryBucket>>,
change_sender: UnboundedSender<(String, String)>,
change_receiver: Mutex<UnboundedReceiver<(String, String)>>,
}
pub struct MemoryBucketRef {
name: String,
inner: Arc<MemoryStorageInner>,
}
struct MemoryBucket {
data: HashMap<String, (u64, String)>,
}
impl MemoryBucket {
fn new() -> Self {
MemoryBucket {
data: HashMap::new(),
}
}
}
impl MemoryStorage {
pub fn new() -> Self {
let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
MemoryStorage {
inner: Arc::new(MemoryStorageInner {
data: Mutex::new(HashMap::new()),
change_sender: tx,
change_receiver: Mutex::new(rx),
}),
}
}
}
#[async_trait]
impl KeyValueStore for MemoryStorage {
async fn get_or_create_bucket(
&self,
bucket_name: &str,
// MemoryStorage doesn't respect TTL yet
_ttl: Option<Duration>,
) -> Result<Box<dyn KeyValueBucket>, StorageError> {
let mut locked_data = self.inner.data.lock().await;
// Ensure the bucket exists
locked_data
.entry(bucket_name.to_string())
.or_insert_with(MemoryBucket::new);
// Return an object able to access it
Ok(Box::new(MemoryBucketRef {
name: bucket_name.to_string(),
inner: self.inner.clone(),
}))
}
/// This operation cannot fail on MemoryStorage. Always returns Ok.
async fn get_bucket(
&self,
bucket_name: &str,
) -> Result<Option<Box<dyn KeyValueBucket>>, StorageError> {
let locked_data = self.inner.data.lock().await;
match locked_data.get(bucket_name) {
Some(_) => Ok(Some(Box::new(MemoryBucketRef {
name: bucket_name.to_string(),
inner: self.inner.clone(),
}))),
None => Ok(None),
}
}
}
#[async_trait]
impl KeyValueBucket for MemoryBucketRef {
async fn insert(
&self,
key: String,
value: String,
revision: u64,
) -> Result<StorageOutcome, StorageError> {
let mut locked_data = self.inner.data.lock().await;
let mut b = locked_data.get_mut(&self.name);
let Some(bucket) = b.as_mut() else {
return Err(StorageError::MissingBucket(self.name.to_string()));
};
let outcome = match bucket.data.entry(key.to_string()) {
Entry::Vacant(e) => {
e.insert((revision, value.clone()));
let _ = self.inner.change_sender.send((key, value));
StorageOutcome::Created(revision)
}
Entry::Occupied(mut entry) => {
let (rev, _v) = entry.get();
if *rev == revision {
StorageOutcome::Exists(revision)
} else {
entry.insert((revision, value));
StorageOutcome::Created(revision)
}
}
};
Ok(outcome)
}
async fn get(&self, key: &str) -> Result<Option<bytes::Bytes>, StorageError> {
let locked_data = self.inner.data.lock().await;
let Some(bucket) = locked_data.get(&self.name) else {
return Ok(None);
};
Ok(bucket
.data
.get(key)
.map(|(_, v)| bytes::Bytes::from(v.clone())))
}
async fn delete(&self, key: &str) -> Result<(), StorageError> {
let mut locked_data = self.inner.data.lock().await;
let Some(bucket) = locked_data.get_mut(&self.name) else {
return Err(StorageError::MissingBucket(self.name.to_string()));
};
bucket.data.remove(key);
Ok(())
}
/// Yields all current values in the bucket first, then blocks waiting for new
/// values to be published.
/// The caller takes the lock, so only a single caller may use this at once.
async fn watch(
&self,
) -> Result<Pin<Box<dyn futures::Stream<Item = bytes::Bytes> + Send + 'life0>>, StorageError>
{
Ok(Box::pin(async_stream::stream! {
// All the existing ones first
let mut seen = HashSet::new();
let data_lock = self.inner.data.lock().await;
let Some(bucket) = data_lock.get(&self.name) else {
tracing::error!(bucket_name = self.name, "watch: Missing bucket");
return;
};
for (_rev, v) in bucket.data.values() {
seen.insert(v.clone());
yield bytes::Bytes::from(v.clone());
}
drop(data_lock);
// Now any new ones
let mut rcv_lock = self.inner.change_receiver.lock().await;
loop {
match rcv_lock.recv().await {
None => {
// Channel is closed, no more values coming
break;
},
Some((_k, v)) => {
if seen.contains(&v) {
continue;
}
yield bytes::Bytes::from(v.clone());
}
}
}
}))
}
async fn entries(&self) -> Result<HashMap<String, bytes::Bytes>, StorageError> {
let locked_data = self.inner.data.lock().await;
match locked_data.get(&self.name) {
Some(bucket) => Ok(bucket
.data
.iter()
.map(|(k, (_rev, v))| (k.to_string(), bytes::Bytes::from(v.clone())))
.collect()),
None => Err(StorageError::MissingBucket(self.name.clone())),
}
}
}
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::{collections::HashMap, pin::Pin, time::Duration};
use async_trait::async_trait;
use dynamo_runtime::{protocols::Endpoint, slug::Slug, transports::nats::Client};
use futures::StreamExt;
use super::{KeyValueBucket, KeyValueStore, StorageError, StorageOutcome};
#[derive(Clone)]
pub struct NATSStorage {
client: Client,
endpoint: Endpoint,
}
pub struct NATSBucket {
nats_store: async_nats::jetstream::kv::Store,
}
#[async_trait]
impl KeyValueStore for NATSStorage {
async fn get_or_create_bucket(
&self,
bucket_name: &str,
ttl: Option<Duration>,
) -> Result<Box<dyn KeyValueBucket>, StorageError> {
let name = Slug::slugify(bucket_name);
let nats_store = self
.get_or_create_key_value(&self.endpoint.namespace, &name, ttl)
.await?;
Ok(Box::new(NATSBucket { nats_store }))
}
async fn get_bucket(
&self,
bucket_name: &str,
) -> Result<Option<Box<dyn KeyValueBucket>>, StorageError> {
let name = Slug::slugify(bucket_name);
match self.get_key_value(&self.endpoint.namespace, &name).await? {
Some(nats_store) => Ok(Some(Box::new(NATSBucket { nats_store }))),
None => Ok(None),
}
}
}
impl NATSStorage {
pub fn new(client: Client, endpoint: Endpoint) -> Self {
NATSStorage { client, endpoint }
}
/// Get or create a key-value store (aka bucket) in NATS.
///
/// The ttl is only used when creating the bucket, so if it has
/// changed, delete the bucket first.
async fn get_or_create_key_value(
&self,
namespace: &str,
bucket_name: &Slug,
// Delete entries older than this
ttl: Option<Duration>,
) -> Result<async_nats::jetstream::kv::Store, StorageError> {
if let Ok(Some(kv)) = self.get_key_value(namespace, bucket_name).await {
return Ok(kv);
}
// It doesn't exist, create it
let bucket_name = single_name(namespace, bucket_name);
let js = self.client.jetstream();
let create_result = js
.create_key_value(
// TODO: configure the bucket, probably need to pass some of these values in
async_nats::jetstream::kv::Config {
bucket: bucket_name.clone(),
max_age: ttl.unwrap_or_default(),
..Default::default()
},
)
.await;
tracing::debug!("Created bucket {bucket_name}");
create_result.map_err(|err| StorageError::KeyValueError(err.to_string(), bucket_name))
}
async fn get_key_value(
&self,
namespace: &str,
bucket_name: &Slug,
) -> Result<Option<async_nats::jetstream::kv::Store>, StorageError> {
let bucket_name = single_name(namespace, bucket_name);
let js = self.client.jetstream();
use async_nats::jetstream::context::KeyValueErrorKind;
match js.get_key_value(&bucket_name).await {
Ok(store) => Ok(Some(store)),
Err(err) if err.kind() == KeyValueErrorKind::GetBucket => {
// bucket doesn't exist
Ok(None)
}
Err(err) => Err(StorageError::KeyValueError(err.to_string(), bucket_name)),
}
}
}
#[async_trait]
impl KeyValueBucket for NATSBucket {
async fn insert(
&self,
key: String,
value: String,
revision: u64,
) -> Result<StorageOutcome, StorageError> {
if revision == 0 {
self.create(key, value).await
} else {
self.update(key, value, revision).await
}
}
async fn get(&self, key: &str) -> Result<Option<bytes::Bytes>, StorageError> {
self.nats_store
.get(key)
.await
.map_err(|e| StorageError::NATSError(e.to_string()))
}
async fn delete(&self, key: &str) -> Result<(), StorageError> {
self.nats_store
.delete(key)
.await
.map_err(|e| StorageError::NATSError(e.to_string()))
}
async fn watch(
&self,
) -> Result<Pin<Box<dyn futures::Stream<Item = bytes::Bytes> + Send + 'life0>>, StorageError>
{
let watch_stream = self
.nats_store
.watch_all()
.await
.map_err(|e| StorageError::NATSError(e.to_string()))?;
// Map each `Entry` to `Entry.value`, the stored value as `Bytes`.
Ok(Box::pin(
watch_stream.filter_map(
|maybe_entry: Result<
async_nats::jetstream::kv::Entry,
async_nats::error::Error<_>,
>| async move {
match maybe_entry {
Ok(entry) => Some(entry.value),
Err(e) => {
tracing::error!(error=%e, "watch fatal err");
None
}
}
},
),
))
}
async fn entries(&self) -> Result<HashMap<String, bytes::Bytes>, StorageError> {
let mut key_stream = self
.nats_store
.keys()
.await
.map_err(|e| StorageError::NATSError(e.to_string()))?;
let mut out = HashMap::new();
while let Some(Ok(key)) = key_stream.next().await {
if let Ok(Some(entry)) = self.nats_store.entry(&key).await {
out.insert(key, entry.value);
}
}
Ok(out)
}
}
impl NATSBucket {
async fn create(&self, key: String, value: String) -> Result<StorageOutcome, StorageError> {
match self.nats_store.create(&key, value.into()).await {
Ok(revision) => Ok(StorageOutcome::Created(revision)),
Err(err) if err.kind() == async_nats::jetstream::kv::CreateErrorKind::AlreadyExists => {
// key exists, get the revision
match self.nats_store.entry(&key).await {
Ok(Some(entry)) => Ok(StorageOutcome::Exists(entry.revision)),
Ok(None) => {
tracing::error!(
key,
"Race condition, key deleted between create and fetch. Retry."
);
Err(StorageError::Retry)
}
Err(err) => Err(StorageError::NATSError(err.to_string())),
}
}
Err(err) => Err(StorageError::NATSError(err.to_string())),
}
}
async fn update(
&self,
key: String,
value: String,
revision: u64,
) -> Result<StorageOutcome, StorageError> {
match self
.nats_store
.update(key.clone(), value.clone().into(), revision)
.await
{
Ok(revision) => Ok(StorageOutcome::Created(revision)),
Err(err)
if err.kind() == async_nats::jetstream::kv::UpdateErrorKind::WrongLastRevision =>
{
tracing::warn!(revision, key, "Update WrongLastRevision, resync");
self.resync_update(key, value).await
}
Err(err) => Err(StorageError::NATSError(err.to_string())),
}
}
/// We have the wrong revision for a key. Fetch its entry to get the correct revision,
/// and try the update again.
async fn resync_update(
&self,
key: String,
value: String,
) -> Result<StorageOutcome, StorageError> {
match self.nats_store.entry(&key).await {
Ok(Some(entry)) => {
// Re-try the update with new version number
let next_rev = entry.revision + 1;
match self
.nats_store
.update(key.clone(), value.into(), next_rev)
.await
{
Ok(correct_revision) => Ok(StorageOutcome::Created(correct_revision)),
Err(err) => Err(StorageError::NATSError(format!(
"Error during update of key {key} after resync: {err}"
))),
}
}
Ok(None) => {
tracing::warn!(key, "Entry does not exist during resync, creating.");
self.create(key, value).await
}
Err(err) => {
tracing::error!(key, %err, "Failed fetching entry during resync");
Err(StorageError::NATSError(err.to_string()))
}
}
}
}
/// async-nats won't let us use a multi-part subject to create KV buckets (and probably many other
/// things).
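/// e.g. `single_name("dynamo", &Slug::slugify("mdc"))` -> `"dynamo_mdc"` (assuming
/// `slugify` leaves a simple name unchanged).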
fn single_name(namespace: &str, name: &Slug) -> String {
format!("{namespace}_{name}")
}
......@@ -24,6 +24,7 @@ pub mod disagg_router;
pub mod engines;
pub mod gguf;
pub mod http;
pub mod key_value_store;
pub mod kv_router;
pub mod model_card;
pub mod model_type;
......
......@@ -13,5 +13,17 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::time::Duration;
pub mod create;
pub mod model;
// TODO: Do these network/publish related model deployment card values belong here or in a
// network module?
/// Identify model deployment cards in the key-value store
pub const BUCKET_NAME: &str = "mdc";
/// Delete model deployment cards that haven't been re-published after this long.
/// Cleans up if the worker stopped.
pub const BUCKET_TTL: Duration = Duration::from_secs(5 * 60);
......@@ -75,8 +75,8 @@ impl ModelDeploymentCard {
Ok(Self {
display_name: model_name.to_string(),
service_name: model_name.to_string(),
model_info: ModelInfoType::GGUF(gguf_file.to_path_buf()),
tokenizer: TokenizerKind::from_gguf(gguf_file)?,
model_info: Some(ModelInfoType::GGUF(gguf_file.to_path_buf())),
tokenizer: Some(TokenizerKind::from_gguf(gguf_file)?),
prompt_formatter: Some(PromptFormatterArtifact::GGUF(gguf_file.to_path_buf())),
prompt_context: None, // TODO - auto-detect prompt context
revision: 0,
......@@ -97,8 +97,8 @@ impl ModelDeploymentCard {
Ok(Self {
display_name: model_name.to_string(),
service_name: model_name.to_string(),
model_info: ModelInfoType::from_repo(repo_id).await?,
tokenizer: TokenizerKind::from_repo(repo_id).await?,
model_info: Some(ModelInfoType::from_repo(repo_id).await?),
tokenizer: Some(TokenizerKind::from_repo(repo_id).await?),
prompt_formatter: PromptFormatterArtifact::from_repo(repo_id).await?,
prompt_context: None, // TODO - auto-detect prompt context
revision: 0,
......
......@@ -34,11 +34,14 @@ use std::time::Duration;
use anyhow::{Context, Result};
use derive_builder::Builder;
use dynamo_runtime::slug::Slug;
use dynamo_runtime::transports::nats;
use either::Either;
use serde::{Deserialize, Serialize};
use tokenizers::Tokenizer as HfTokenizer;
use url::Url;
use crate::gguf::{Content, ContentConfig};
use crate::key_value_store::Versioned;
use crate::protocols::TokenIdType;
pub const BUCKET_NAME: &str = "mdc";
......@@ -93,7 +96,7 @@ pub enum PromptContextMixin {
Llama3DateTime,
}
#[derive(Serialize, Deserialize, Clone, Debug, Builder)]
#[derive(Serialize, Deserialize, Clone, Debug, Builder, Default)]
pub struct ModelDeploymentCard {
/// Human readable model name, e.g. "Meta Llama 3.1 8B Instruct"
pub display_name: String,
......@@ -103,10 +106,10 @@ pub struct ModelDeploymentCard {
pub service_name: String,
/// Model information
pub model_info: ModelInfoType,
pub model_info: Option<ModelInfoType>,
/// Tokenizer configuration
pub tokenizer: TokenizerKind,
pub tokenizer: Option<TokenizerKind>,
/// Prompt Formatter configuration
#[serde(default, skip_serializing_if = "Option::is_none")]
......@@ -135,6 +138,19 @@ impl ModelDeploymentCard {
ModelDeploymentCardBuilder::default()
}
/// Create a ModelDeploymentCard where only the name is filled in.
///
/// Single-process setups don't need an MDC to communicate model details, but it
/// simplifies the code to assume we always have one. This is how you get one in those
/// cases. A quasi-null object: <https://en.wikipedia.org/wiki/Null_object_pattern>
pub fn with_name_only(name: &str) -> ModelDeploymentCard {
ModelDeploymentCard {
display_name: name.to_string(),
service_name: Slug::from_string(name).to_string(),
..Default::default()
}
}
/// A URL- and NATS-friendly, very likely unique ID for this model.
/// Mostly human readable: a-z, 0-9, _ and - only.
/// Pass the service_name.
......@@ -142,10 +158,6 @@ impl ModelDeploymentCard {
Slug::from_string(s)
}
pub fn set_service_name(&mut self, service_name: &str) {
self.service_name = service_name.to_string();
}
/// How often we should check whether a model deployment card has expired because its workers are gone
pub fn expiry_check_period() -> Duration {
match CARD_MAX_AGE.to_std() {
......@@ -157,10 +169,6 @@ impl ModelDeploymentCard {
}
}
pub fn slug(&self) -> Slug {
ModelDeploymentCard::service_name_slug(&self.service_name)
}
/// Load a model deployment card from a JSON file
pub fn load_from_json_file<P: AsRef<Path>>(file: P) -> std::io::Result<Self> {
let mut card: ModelDeploymentCard = serde_json::from_str(&std::fs::read_to_string(file)?)?;
......@@ -173,12 +181,24 @@ impl ModelDeploymentCard {
Ok(serde_json::from_str(json)?)
}
//
// Methods
//
/// Save the model deployment card to a JSON file
pub fn save_to_json_file(&self, file: &str) -> Result<(), anyhow::Error> {
std::fs::write(file, self.to_json()?)?;
Ok(())
}
pub fn set_service_name(&mut self, service_name: &str) {
self.service_name = service_name.to_string();
}
pub fn slug(&self) -> Slug {
ModelDeploymentCard::service_name_slug(&self.service_name)
}
/// Serialize the model deployment card to a JSON string
pub fn to_json(&self) -> Result<String, anyhow::Error> {
Ok(serde_json::to_string(self)?)
......@@ -200,12 +220,87 @@ impl ModelDeploymentCard {
pub fn tokenizer_hf(&self) -> anyhow::Result<HfTokenizer> {
match &self.tokenizer {
TokenizerKind::HfTokenizerJson(file) => {
Some(TokenizerKind::HfTokenizerJson(file)) => {
HfTokenizer::from_file(file).map_err(anyhow::Error::msg)
}
TokenizerKind::GGUF(t) => Ok(*t.clone()),
Some(TokenizerKind::GGUF(t)) => Ok(*t.clone()),
None => {
anyhow::bail!("Blank ModelDeploymentCard does not have a tokenizer");
}
}
}
/// Move the files this MDC uses into the NATS object store.
/// Updates the URIs to point to NATS.
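/// As called by the worker in this PR, before publishing the card:
///
/// ```ignore
/// card.move_to_nats(nats_client.clone()).await?;
/// ```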
pub async fn move_to_nats(&mut self, nats_client: nats::Client) -> Result<()> {
let nats_addr = nats_client.addr();
let bucket_name = self.slug();
tracing::debug!(
nats_addr,
%bucket_name,
"Uploading model deployment card to NATS"
);
if let Some(ModelInfoType::HfConfigJson(ref src_file)) = self.model_info {
if !nats::is_nats_url(src_file) {
let target = format!("nats://{nats_addr}/{bucket_name}/config.json");
nats_client
.object_store_upload(&PathBuf::from(src_file), Url::parse(&target)?)
.await?;
self.model_info = Some(ModelInfoType::HfConfigJson(target));
}
}
if let Some(PromptFormatterArtifact::HfTokenizerConfigJson(ref src_file)) =
self.prompt_formatter
{
if !nats::is_nats_url(src_file) {
let target = format!("nats://{nats_addr}/{bucket_name}/tokenizer_config.json");
nats_client
.object_store_upload(&PathBuf::from(src_file), Url::parse(&target)?)
.await?;
self.prompt_formatter =
Some(PromptFormatterArtifact::HfTokenizerConfigJson(target));
}
}
if let Some(TokenizerKind::HfTokenizerJson(ref src_file)) = self.tokenizer {
if !nats::is_nats_url(src_file) {
let target = format!("nats://{nats_addr}/{bucket_name}/tokenizer.json");
nats_client
.object_store_upload(&PathBuf::from(src_file), Url::parse(&target)?)
.await?;
self.tokenizer = Some(TokenizerKind::HfTokenizerJson(target));
}
}
Ok(())
}
/// Delete this card from the key-value store and its URLs from the object store
pub async fn delete_from_nats(&mut self, nats_client: nats::Client) -> Result<()> {
let nats_addr = nats_client.addr();
let bucket_name = self.slug();
tracing::trace!(
nats_addr,
%bucket_name,
"Delete model deployment card from NATS"
);
nats_client
.object_store_delete_bucket(bucket_name.as_ref())
.await
}
}
impl Versioned for ModelDeploymentCard {
fn revision(&self) -> u64 {
self.revision
}
fn set_revision(&mut self, revision: u64) {
self.last_published = Some(chrono::Utc::now());
self.revision = revision;
}
}
impl fmt::Display for ModelDeploymentCard {
......
......@@ -69,20 +69,29 @@ pub struct OpenAIPreprocessor {
impl OpenAIPreprocessor {
pub async fn new(mdc: ModelDeploymentCard) -> Result<Arc<Self>> {
let mdcsum = mdc.mdcsum();
let formatter = PromptFormatter::from_mdc(mdc.clone()).await?;
let PromptFormatter::OAI(formatter) = formatter;
let tokenizer = match &mdc.tokenizer {
TokenizerKind::HfTokenizerJson(file) => HuggingFaceTokenizer::from_file(file)?,
TokenizerKind::GGUF(tokenizer) => {
Some(TokenizerKind::HfTokenizerJson(file)) => HuggingFaceTokenizer::from_file(file)?,
Some(TokenizerKind::GGUF(tokenizer)) => {
HuggingFaceTokenizer::from_tokenizer(*tokenizer.clone())
}
None => {
anyhow::bail!(
"Blank ModelDeploymentCard cannot be used for pre-processing, no tokenizer"
);
}
};
let tokenizer = Arc::new(tokenizer);
let model_info = mdc.model_info.get_model_info().await?;
let mdcsum = mdc.mdcsum();
let Some(model_info) = mdc.model_info else {
anyhow::bail!(
"Blank ModelDeploymentCard cannot be used for pre-processing, no model_info"
);
};
let model_info = model_info.get_model_info().await?;
Ok(Arc::new(Self {
formatter,
......
......@@ -24,11 +24,11 @@ async fn test_sequence_factory() {
let operator = Backend::from_mdc(mdc).await.unwrap();
let mut decode_stream = operator.tokenizer.decode_stream(false);
let mut decode_stream = operator.tokenizer.as_ref().unwrap().decode_stream(false);
let output = decode_stream.step(1).unwrap();
assert_eq!(output, Some("<s>".to_string()));
let mut decode_stream = operator.tokenizer.decode_stream(true);
let mut decode_stream = operator.tokenizer.as_ref().unwrap().decode_stream(true);
let output = decode_stream.step(1).unwrap();
assert_eq!(output, None);
}
......@@ -23,7 +23,7 @@ async fn test_model_info_from_hf_like_local_repo() {
let mdc = ModelDeploymentCard::from_local_path(HF_PATH, None)
.await
.unwrap();
let info = mdc.model_info.get_model_info().await.unwrap();
let info = mdc.model_info.unwrap().get_model_info().await.unwrap();
assert_eq!(info.model_type(), "llama");
assert_eq!(info.bos_token_id(), 1);
assert_eq!(info.eos_token_ids(), vec![2]);
......@@ -44,7 +44,7 @@ async fn test_tokenizer_from_hf_like_local_repo() {
.await
.unwrap();
// Verify tokenizer file was found
match mdc.tokenizer {
match mdc.tokenizer.unwrap() {
TokenizerKind::HfTokenizerJson(_) => (),
TokenizerKind::GGUF(_) => (),
}
......
......@@ -43,6 +43,7 @@ derive-getters = { workspace = true }
either = { workspace = true }
futures = { workspace = true }
humantime = { workspace = true }
prometheus = { workspace = true }
rand = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
......@@ -51,26 +52,24 @@ tokio-stream = { workspace = true }
tokio-util = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { workspace = true }
validator = { workspace = true }
thiserror = { workspace = true }
uuid = { workspace = true }
url = { workspace = true }
validator = { workspace = true }
xxhash-rust = { workspace = true }
prometheus = { workspace = true }
thiserror = { workspace = true }
# Dependencies not yet in workspace
figment = { version = "0.10.19", features = ["env", "json", "toml", "test"] }
log = { version = "0.4" }
once_cell = { version = "1" }
regex = { version = "1" }
socket2 = { version = "0.5.8" }
async-once-cell = { version = "0.5.4" }
educe = { version = "0.6.0" }
etcd-client = { version = "0.14" }
figment = { version = "0.10.19", features = ["env", "json", "toml", "test"] }
local-ip-address = { version = "0.6.3" }
log = { version = "0.4" }
nid = { version = "3.0.0", features = ["serde"] }
nix = { version = "0.29", features = ["signal"] }
nuid = { version = "0.5" }
once_cell = { version = "1" }
regex = { version = "1" }
socket2 = { version = "0.5.8" }
[dev-dependencies]
assert_matches = { version = "1.5.0" }
......
......@@ -660,6 +660,7 @@ dependencies = [
"tokio-util",
"tracing",
"tracing-subscriber",
"url",
"uuid",
"validator",
"xxhash-rust",
......@@ -2921,6 +2922,7 @@ dependencies = [
"form_urlencoded",
"idna",
"percent-encoding",
"serde",
]
[[package]]
......